-rw-r--r--  Documentation/ABI/testing/sysfs-devices-node | 7
-rw-r--r--  Documentation/filesystems/tmpfs.txt | 10
-rw-r--r--  Documentation/filesystems/xfs-delayed-logging-design.txt | 816
-rw-r--r--  Documentation/kernel-parameters.txt | 15
-rw-r--r--  Documentation/sysctl/vm.txt | 25
-rw-r--r--  Documentation/watchdog/00-INDEX | 5
-rw-r--r--  Documentation/watchdog/watchdog-parameters.txt | 390
-rw-r--r--  Documentation/watchdog/wdt.txt | 15
-rw-r--r--  arch/alpha/math-emu/sfp-util.h | 5
-rw-r--r--  arch/arm/plat-samsung/include/plat/regs-rtc.h | 4
-rw-r--r--  arch/frv/include/asm/cache.h | 2
-rw-r--r--  arch/frv/include/asm/gdb-stub.h | 7
-rw-r--r--  arch/frv/kernel/gdb-io.c | 4
-rw-r--r--  arch/frv/kernel/gdb-stub.c | 61
-rw-r--r--  arch/mn10300/include/asm/atomic.h | 158
-rw-r--r--  arch/mn10300/include/asm/cache.h | 2
-rw-r--r--  arch/powerpc/include/asm/sfp-machine.h | 6
-rw-r--r--  arch/s390/include/asm/sfp-util.h | 2
-rw-r--r--  arch/sh/math-emu/sfp-util.h | 4
-rw-r--r--  arch/sparc/math-emu/sfp-util_32.h | 6
-rw-r--r--  arch/sparc/math-emu/sfp-util_64.h | 6
-rw-r--r--  arch/x86/boot/compressed/relocs.c | 4
-rw-r--r--  arch/x86/include/asm/msr-index.h | 2
-rw-r--r--  arch/xtensa/include/asm/cache.h | 1
-rw-r--r--  arch/xtensa/include/asm/hardirq.h | 15
-rw-r--r--  arch/xtensa/kernel/irq.c | 9
-rw-r--r--  arch/xtensa/kernel/vectors.S | 2
-rw-r--r--  drivers/acpi/bus.c | 9
-rw-r--r--  drivers/auxdisplay/cfag12864bfb.c | 8
-rw-r--r--  drivers/base/node.c | 3
-rw-r--r--  drivers/char/hangcheck-timer.c | 20
-rw-r--r--  drivers/char/hvsi.c | 6
-rw-r--r--  drivers/char/misc.c | 1
-rw-r--r--  drivers/cpuidle/governors/menu.c | 60
-rw-r--r--  drivers/dma/timb_dma.c | 2
-rw-r--r--  drivers/hwmon/Kconfig | 9
-rw-r--r--  drivers/hwmon/Makefile | 1
-rw-r--r--  drivers/hwmon/ads7871.c | 253
-rw-r--r--  drivers/hwmon/coretemp.c | 93
-rw-r--r--  drivers/hwmon/lis3lv02d.c | 245
-rw-r--r--  drivers/hwmon/lis3lv02d.h | 11
-rw-r--r--  drivers/isdn/gigaset/capi.c | 13
-rw-r--r--  drivers/misc/Kconfig | 32
-rw-r--r--  drivers/misc/Makefile | 2
-rw-r--r--  drivers/misc/ad525x_dpot-i2c.c | 134
-rw-r--r--  drivers/misc/ad525x_dpot-spi.c | 172
-rw-r--r--  drivers/misc/ad525x_dpot.c | 1016
-rw-r--r--  drivers/misc/ad525x_dpot.h | 202
-rw-r--r--  drivers/net/wireless/airo.c | 15
-rw-r--r--  drivers/power/power_supply_sysfs.c | 6
-rw-r--r--  drivers/rtc/Kconfig | 2
-rw-r--r--  drivers/rtc/rtc-cmos.c | 5
-rw-r--r--  drivers/rtc/rtc-ds1302.c | 85
-rw-r--r--  drivers/rtc/rtc-isl1208.c | 45
-rw-r--r--  drivers/rtc/rtc-mxc.c | 25
-rw-r--r--  drivers/rtc/rtc-s3c.c | 107
-rw-r--r--  drivers/rtc/rtc-wm831x.c | 16
-rw-r--r--  drivers/scsi/fcoe/fcoe.c | 2
-rw-r--r--  drivers/scsi/mpt2sas/mpt2sas_base.c | 2
-rw-r--r--  drivers/scsi/mpt2sas/mpt2sas_config.c | 2
-rw-r--r--  drivers/serial/68328serial.c | 2
-rw-r--r--  drivers/staging/rt2860/common/rtmp_init.c | 15
-rw-r--r--  drivers/staging/rt2860/rtmp.h | 2
-rw-r--r--  drivers/usb/atm/speedtch.c | 5
-rw-r--r--  drivers/vhost/vhost.c | 2
-rw-r--r--  drivers/video/arcfb.c | 8
-rw-r--r--  drivers/video/aty/atyfb_base.c | 4
-rw-r--r--  drivers/video/bfin-lq035q1-fb.c | 252
-rw-r--r--  drivers/video/da8xx-fb.c | 301
-rw-r--r--  drivers/video/fb_defio.c | 40
-rw-r--r--  drivers/video/hgafb.c | 10
-rw-r--r--  drivers/video/hitfb.c | 8
-rw-r--r--  drivers/video/intelfb/intelfb.h | 4
-rw-r--r--  drivers/video/nuc900fb.c | 2
-rw-r--r--  drivers/video/s3c2410fb.c | 10
-rw-r--r--  drivers/video/sgivwfb.c | 10
-rw-r--r--  drivers/video/sis/sis_main.c | 2
-rw-r--r--  drivers/video/vfb.c | 4
-rw-r--r--  drivers/video/vga16fb.c | 10
-rw-r--r--  drivers/video/w100fb.c | 10
-rw-r--r--  drivers/watchdog/Kconfig | 26
-rw-r--r--  drivers/watchdog/Makefile | 1
-rw-r--r--  drivers/watchdog/bfin_wdt.c | 19
-rw-r--r--  drivers/watchdog/booke_wdt.c | 6
-rw-r--r--  drivers/watchdog/eurotechwdt.c | 1
-rw-r--r--  drivers/watchdog/iTCO_vendor_support.c | 11
-rw-r--r--  drivers/watchdog/iTCO_wdt.c | 29
-rw-r--r--  drivers/watchdog/imx2_wdt.c | 358
-rw-r--r--  drivers/watchdog/mpc8xxx_wdt.c | 2
-rw-r--r--  drivers/watchdog/pc87413_wdt.c | 9
-rw-r--r--  drivers/watchdog/pnx833x_wdt.c | 11
-rw-r--r--  drivers/watchdog/s3c2410_wdt.c | 9
-rw-r--r--  drivers/watchdog/shwdt.c | 2
-rw-r--r--  drivers/watchdog/twl4030_wdt.c | 2
-rw-r--r--  drivers/watchdog/wdt.c | 2
-rw-r--r--  drivers/watchdog/wdt977.c | 2
-rw-r--r--  drivers/xen/manage.c | 14
-rw-r--r--  fs/exec.c | 7
-rw-r--r--  fs/fat/cache.c | 13
-rw-r--r--  fs/fat/fat.h | 12
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/fat/misc.c | 22
-rw-r--r--  fs/fs-writeback.c | 4
-rw-r--r--  fs/gfs2/acl.c | 4
-rw-r--r--  fs/gfs2/file.c | 7
-rw-r--r--  fs/gfs2/inode.c | 54
-rw-r--r--  fs/gfs2/inode.h | 3
-rw-r--r--  fs/gfs2/log.c | 2
-rw-r--r--  fs/gfs2/log.h | 29
-rw-r--r--  fs/gfs2/rgrp.c | 20
-rw-r--r--  fs/nfs/super.c | 4
-rw-r--r--  fs/nfsd/nfsctl.c | 4
-rw-r--r--  fs/ntfs/file.c | 28
-rw-r--r--  fs/ocfs2/blockcheck.c | 4
-rw-r--r--  fs/partitions/ldm.c | 18
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/smbfs/symlink.c | 1
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 83
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 6
-rw-r--r--  fs/xfs/xfs_ag.h | 24
-rw-r--r--  fs/xfs/xfs_alloc.c | 357
-rw-r--r--  fs/xfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 166
-rw-r--r--  fs/xfs/xfs_buf_item.h | 18
-rw-r--r--  fs/xfs/xfs_error.c | 2
-rw-r--r--  fs/xfs/xfs_log.c | 120
-rw-r--r--  fs/xfs/xfs_log.h | 14
-rw-r--r--  fs/xfs/xfs_log_cil.c | 725
-rw-r--r--  fs/xfs/xfs_log_priv.h | 118
-rw-r--r--  fs/xfs/xfs_log_recover.c | 46
-rw-r--r--  fs/xfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_trans.c | 144
-rw-r--r--  fs/xfs/xfs_trans.h | 44
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 46
-rw-r--r--  fs/xfs/xfs_trans_item.c | 114
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 15
-rw-r--r--  fs/xfs/xfs_types.h | 2
-rw-r--r--  include/asm-generic/atomic.h | 8
-rw-r--r--  include/asm-generic/kmap_types.h | 3
-rw-r--r--  include/linux/byteorder/big_endian.h | 3
-rw-r--r--  include/linux/byteorder/little_endian.h | 3
-rw-r--r--  include/linux/compaction.h | 89
-rw-r--r--  include/linux/cpuset.h | 43
-rw-r--r--  include/linux/dynamic_debug.h | 2
-rw-r--r--  include/linux/err.h | 10
-rw-r--r--  include/linux/fb.h | 2
-rw-r--r--  include/linux/gfp.h | 18
-rw-r--r--  include/linux/highmem.h | 2
-rw-r--r--  include/linux/ivtvfb.h | 1
-rw-r--r--  include/linux/kernel.h | 25
-rw-r--r--  include/linux/lis3lv02d.h | 12
-rw-r--r--  include/linux/matroxfb.h | 3
-rw-r--r--  include/linux/memcontrol.h | 13
-rw-r--r--  include/linux/memory_hotplug.h | 1
-rw-r--r--  include/linux/mempolicy.h | 15
-rw-r--r--  include/linux/migrate.h | 6
-rw-r--r--  include/linux/mm.h | 7
-rw-r--r--  include/linux/mmzone.h | 14
-rw-r--r--  include/linux/ratelimit.h | 13
-rw-r--r--  include/linux/rmap.h | 27
-rw-r--r--  include/linux/sched.h | 3
-rw-r--r--  include/linux/swap.h | 14
-rw-r--r--  include/linux/vmstat.h | 4
-rw-r--r--  include/net/ip.h | 6
-rw-r--r--  include/net/ipv6.h | 6
-rw-r--r--  include/video/da8xx-fb.h | 1
-rw-r--r--  include/video/sh_mobile_lcdc.h | 2
-rw-r--r--  init/main.c | 2
-rw-r--r--  ipc/msg.c | 12
-rw-r--r--  ipc/util.c | 4
-rw-r--r--  kernel/cpu.c | 28
-rw-r--r--  kernel/cpuset.c | 58
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/module.c | 2
-rw-r--r--  kernel/sysctl.c | 25
-rw-r--r--  kernel/sysctl_binary.c | 9
-rw-r--r--  lib/Kconfig.debug | 16
-rw-r--r--  lib/crc32.c | 26
-rw-r--r--  lib/dynamic_debug.c | 2
-rw-r--r--  lib/gen_crc32table.c | 47
-rw-r--r--  lib/hexdump.c | 54
-rw-r--r--  lib/vsprintf.c | 69
-rw-r--r--  mm/Kconfig | 17
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/compaction.c | 605
-rw-r--r--  mm/filemap.c | 14
-rw-r--r--  mm/highmem.c | 2
-rw-r--r--  mm/hugetlb.c | 12
-rw-r--r--  mm/ksm.c | 4
-rw-r--r--  mm/memory.c | 13
-rw-r--r--  mm/memory_hotplug.c | 36
-rw-r--r--  mm/mempolicy.c | 226
-rw-r--r--  mm/migrate.c | 72
-rw-r--r--  mm/mincore.c | 263
-rw-r--r--  mm/page_alloc.c | 267
-rw-r--r--  mm/readahead.c | 2
-rw-r--r--  mm/rmap.c | 40
-rw-r--r--  mm/shmem.c | 2
-rw-r--r--  mm/slab.c | 4
-rw-r--r--  mm/slub.c | 6
-rw-r--r--  mm/sparse.c | 9
-rw-r--r--  mm/vmscan.c | 213
-rw-r--r--  mm/vmstat.c | 253
-rw-r--r--  net/9p/protocol.c | 2
-rw-r--r--  net/dccp/options.c | 2
-rw-r--r--  net/ipv4/udp.c | 8
-rw-r--r--  net/mac80211/sta_info.c | 2
-rw-r--r--  net/sunrpc/rpcb_clnt.c | 2
-rw-r--r--  net/sunrpc/xprt.c | 5
-rwxr-xr-x  scripts/checkpatch.pl | 20
-rwxr-xr-x  scripts/get_maintainer.pl | 66
-rw-r--r--  security/keys/keyring.c | 6
218 files changed, 8118 insertions, 2481 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-node b/Documentation/ABI/testing/sysfs-devices-node
new file mode 100644
index 000000000000..453a210c3ceb
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-node
@@ -0,0 +1,7 @@
What:		/sys/devices/system/node/nodeX/compact
Date:		February 2010
Contact:	Mel Gorman <mel@csn.ul.ie>
Description:
		When this file is written to, all memory within that node
		will be compacted. When it completes, memory will be freed
		into blocks which have as many contiguous pages as possible.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index fe09a2cb1858..98ef55124158 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -94,11 +94,19 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
 largest node numbers in the range.  For example, mpol=bind:0-3,5,7,9-15
 
+A memory policy with a valid NodeList will be saved, as specified, for
+use at file creation time.  When a task allocates a file in the file
+system, the mount option memory policy will be applied with a NodeList,
+if any, modified by the calling task's cpuset constraints
+[See Documentation/cgroups/cpusets.txt] and any optional flags, listed
+below.  If the resulting NodeList is the empty set, the effective memory
+policy for the file will revert to "default" policy.
+
 NUMA memory allocation policies have optional flags that can be used in
 conjunction with their modes.  These optional flags can be specified
 when tmpfs is mounted by appending them to the mode before the NodeList.
 See Documentation/vm/numa_memory_policy.txt for a list of all available
-memory allocation policy mode flags.
+memory allocation policy mode flags and their effect on memory policy.
 
 	=static		is equivalent to	MPOL_F_STATIC_NODES
 	=relative	is equivalent to	MPOL_F_RELATIVE_NODES
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
new file mode 100644
index 000000000000..d8119e9d2d60
--- /dev/null
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -0,0 +1,816 @@
XFS Delayed Logging Design
--------------------------

Introduction to Re-logging in XFS
---------------------------------

XFS logging is a combination of logical and physical logging. Some objects,
such as inodes and dquots, are logged in logical format where the details
logged are made up of the changes to in-core structures rather than on-disk
structures. Other objects - typically buffers - have their physical changes
logged. The reason for these differences is to reduce the amount of log space
required for objects that are frequently logged. Some parts of inodes are more
frequently logged than others, and inodes are typically more frequently logged
than any other object (except maybe the superblock buffer) so keeping the
amount of metadata logged low is of prime importance.

The reason that this is such a concern is that XFS allows multiple separate
modifications to a single object to be carried in the log at any given time.
This allows the log to avoid needing to flush each change to disk before
recording a new change to the object. XFS does this via a method called
"re-logging". Conceptually, this is quite simple - all it requires is that any
new change to the object is recorded with a *new copy* of all the existing
changes in the new transaction that is written to the log.

That is, if we have a sequence of changes A through to F, and the object was
written to disk after change D, we would see in the log the following series
of transactions, their contents and the log sequence number (LSN) of the
transaction:

	Transaction		Contents	LSN
	   A			   A		   X
	   B			  A+B		  X+n
	   C			 A+B+C		 X+n+m
	   D			A+B+C+D		X+n+m+o
	    <object written to disk>
	   E			   E		   Y (> X+n+m+o)
	   F			  E+F		  Y+p

In other words, each time an object is relogged, the new transaction contains
the aggregation of all the previous changes currently held only in the log.

This relogging technique also allows objects to be moved forward in the log so
that an object being relogged does not prevent the tail of the log from ever
moving forward. This can be seen in the table above by the changing
(increasing) LSN of each subsequent transaction - the LSN is effectively a
direct encoding of the location in the log of the transaction.

This relogging is also used to implement long-running, multiple-commit
transactions. These transactions are known as rolling transactions, and require
a special log reservation known as a permanent transaction reservation. A
typical example of a rolling transaction is the removal of extents from an
inode which can only be done at a rate of two extents per transaction because
of reservation size limitations. Hence a rolling extent removal transaction
keeps relogging the inode and btree buffers as they get modified in each
removal operation. This keeps them moving forward in the log as the operation
progresses, ensuring that the current operation never gets blocked by itself
if the log wraps around.

Hence it can be seen that the relogging operation is fundamental to the correct
working of the XFS journalling subsystem. From the above description, most
people should be able to see why XFS metadata operations write so much to
the log - repeated operations to the same objects write the same changes to
the log over and over again. Worse is the fact that objects tend to get
dirtier as they get relogged, so each subsequent transaction is writing more
metadata into the log.

Another feature of the XFS transaction subsystem is that most transactions are
asynchronous. That is, they don't commit to disk until either a log buffer is
filled (a log buffer can hold multiple transactions) or a synchronous operation
forces the log buffers holding the transactions to disk. This means that XFS is
doing aggregation of transactions in memory - batching them, if you like - to
minimise the impact of the log IO on transaction throughput.

The limitation on asynchronous transaction throughput is the number and size of
log buffers made available by the log manager. By default there are 8 log
buffers available and the size of each is 32kB - the size can be increased up
to 256kB by use of a mount option.

Effectively, this gives us the maximum bound of outstanding metadata changes
that can be made to the filesystem at any point in time - if all the log
buffers are full and under IO, then no more transactions can be committed until
the current batch completes. It is now common for a single CPU core to be
able to issue enough transactions to keep the log buffers full and under IO
permanently. Hence the XFS journalling subsystem can be considered to be IO
bound.

Delayed Logging: Concepts
-------------------------

The key thing to note about the asynchronous logging combined with the
relogging technique XFS uses is that we can be relogging changed objects
multiple times before they are committed to disk in the log buffers. If we
return to the previous relogging example, it is entirely possible that
transactions A through D are committed to disk in the same log buffer.

That is, a single log buffer may contain multiple copies of the same object,
but only one of those copies needs to be there - the last one "D", as it
contains all of the previous changes. In other words, we have one
necessary copy in the log buffer, and three stale copies that are simply
wasting space. When we are doing repeated operations on the same set of
objects, these "stale objects" can be over 90% of the space used in the log
buffers. It is clear that reducing the number of stale objects written to the
log would greatly reduce the amount of metadata we write to the log, and this
is the fundamental goal of delayed logging.

From a conceptual point of view, XFS is already doing relogging in memory (where
memory == log buffer), only it is doing it extremely inefficiently. It is using
logical to physical formatting to do the relogging because there is no
infrastructure to keep track of logical changes in memory prior to physically
formatting the changes in a transaction to the log buffer. Hence we cannot avoid
accumulating stale objects in the log buffers.

Delayed logging is the name we've given to keeping and tracking transactional
changes to objects in memory outside the log buffer infrastructure. Because of
the relogging concept fundamental to the XFS journalling subsystem, this is
actually relatively easy to do - all the changes to logged items are already
tracked in the current infrastructure. The big problem is how to accumulate
them and get them to the log in a consistent, recoverable manner.
Describing the problems and how they have been solved is the focus of this
document.

One of the key changes that delayed logging makes to the operation of the
journalling subsystem is that it disassociates the amount of outstanding
metadata changes from the size and number of log buffers available. In other
words, instead of there only being a maximum of 2MB of transaction changes not
written to the log at any point in time, there may be a much greater amount
being accumulated in memory. Hence the potential for loss of metadata on a
crash is much greater than for the existing logging mechanism.

It should be noted that this does not change the guarantee that log recovery
will result in a consistent filesystem. What it does mean is that as far as the
recovered filesystem is concerned, there may be many thousands of transactions
that simply did not occur as a result of the crash. This makes it even more
important that applications that care about their data use fsync() where they
need to ensure application level data integrity is maintained.

It should be noted that delayed logging is not an innovative new concept that
warrants rigorous proofs to determine whether it is correct or not. The method
of accumulating changes in memory for some period before writing them to the
log is used effectively in many filesystems including ext3 and ext4. Hence
no time is spent in this document trying to convince the reader that the
concept is sound. Instead it is simply considered a "solved problem" and as
such implementing it in XFS is purely an exercise in software engineering.

The fundamental requirements for delayed logging in XFS are simple:

	1. Reduce the amount of metadata written to the log by at least
	   an order of magnitude.
	2. Supply sufficient statistics to validate Requirement #1.
	3. Supply sufficient new tracing infrastructure to be able to debug
	   problems with the new code.
	4. No on-disk format change (metadata or log format).
	5. Enable and disable with a mount option.
	6. No performance regressions for synchronous transaction workloads.

Delayed Logging: Design
-----------------------

Storing Changes

The problem with accumulating changes at a logical level (i.e. just using the
existing log item dirty region tracking) is that when it comes to writing the
changes to the log buffers, we need to ensure that the object we are formatting
is not changing while we do this. This requires locking the object to prevent
concurrent modification. Hence flushing the logical changes to the log would
require us to lock every object, format them, and then unlock them again.

This introduces lots of scope for deadlocks with transactions that are already
running. For example, a transaction has object A locked and modified, but needs
the delayed logging tracking lock to commit the transaction. However, the
flushing thread has the delayed logging tracking lock already held, and is
trying to get the lock on object A to flush it to the log buffer. This appears
to be an unsolvable deadlock condition, and it was solving this problem that
was the barrier to implementing delayed logging for so long.

The solution is relatively simple - it just took a long time to recognise it.
Put simply, the current logging code formats the changes to each item into a
vector array that points to the changed regions in the item. The log write code
simply copies the memory these vectors point to into the log buffer during
transaction commit while the item is locked in the transaction. Instead of
using the log buffer as the destination of the formatting code, we can use an
allocated memory buffer big enough to fit the formatted vector.

If we then copy the vector into the memory buffer and rewrite the vector to
point to the memory buffer rather than the object itself, we now have a copy of
the changes in a format that is compatible with the log buffer writing code
and that does not require us to lock the item to access it. This formatting and
rewriting can all be done while the object is locked during transaction commit,
resulting in a vector that is transactionally consistent and can be accessed
without needing to lock the owning item.

Hence we avoid the need to lock items when we need to flush outstanding
asynchronous transactions to the log. The differences between the existing
formatting method and the delayed logging formatting can be seen in the
diagram below.

Current format log vector:

Object    +---------------------------------------------+
Vector 1      +----+
Vector 2                    +----+
Vector 3                                  +----------+

After formatting:

Log Buffer    +-V1-+-V2-+----V3----+

Delayed logging vector:

Object    +---------------------------------------------+
Vector 1      +----+
Vector 2                    +----+
Vector 3                                  +----------+

After formatting:

Memory Buffer +-V1-+-V2-+----V3----+
Vector 1      +----+
Vector 2           +----+
Vector 3                +----------+

The memory buffer and associated vector need to be passed as a single object,
but still need to be associated with the parent object so if the object is
relogged we can replace the current memory buffer with a new memory buffer that
contains the latest changes.

The reason for keeping the vector around after we've formatted the memory
buffer is to support splitting vectors across log buffer boundaries correctly.
If we don't keep the vector around, we do not know where the region boundaries
are in the item, so we'd need a new encapsulation method for regions in the log
buffer writing (i.e. double encapsulation). This would be an on-disk format
change and as such is not desirable. It also means we'd have to write the log
region headers in the formatting stage, which is problematic as there is per
region state that needs to be placed into the headers during the log write.

Hence we need to keep the vector, but by attaching the memory buffer to it and
rewriting the vector addresses to point at the memory buffer we end up with a
self-describing object that can be passed to the log buffer write code to be
handled in exactly the same manner as the existing log vectors are handled.
Hence we avoid needing a new on-disk format to handle items that have been
relogged in memory.
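
To make the shape of this self-describing object more concrete, here is a
minimal sketch of a chained log vector. The structure and field names are
illustrative assumptions, not the actual XFS definitions:

	/* One contiguous changed region within a logged item. */
	struct log_iovec {
		void	*i_addr;	/* now points into the memory buffer */
		int	i_len;		/* length of the region copy */
	};

	/* A formatted, self-describing copy of an item's changes. */
	struct log_vector {
		struct log_vector	*lv_next;	/* checkpoint chaining */
		int			lv_niovecs;	/* number of regions */
		struct log_iovec	*lv_iovecp;	/* region array */
		void			*lv_buf;	/* formatted changes */
		int			lv_buf_len;	/* size of lv_buf */
	};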


Tracking Changes

Now that we can record transactional changes in memory in a form that allows
them to be used without limitations, we need to be able to track and accumulate
them so that they can be written to the log at some later point in time. The
log item is the natural place to store this vector and buffer, and it also
makes sense for it to be the object that is used to track committed objects as
it will always exist once the object has been included in a transaction.

The log item is already used to track the log items that have been written to
the log but not yet written to disk. Such log items are considered "active"
and as such are stored in the Active Item List (AIL) which is an LSN-ordered
doubly linked list. Items are inserted into this list during log buffer IO
completion, after which they are unpinned and can be written to disk. An object
that is in the AIL can be relogged, which causes the object to be pinned again
and then moved forward in the AIL when the log buffer IO completes for that
transaction.

Essentially, this shows that an item that is in the AIL can still be modified
and relogged, so any tracking must be separate to the AIL infrastructure. As
such, we cannot reuse the AIL list pointers for tracking committed items, nor
can we store state in any field that is protected by the AIL lock. Hence the
committed item tracking needs its own locks, lists and state fields in the log
item.
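
To illustrate, a log item that supports both forms of tracking might look
like the following sketch; the layout and names here are hypothetical rather
than the real xfs_log_item definition:

	struct log_item {
		/* AIL tracking, protected by the AIL lock */
		struct list_head	li_ail;		/* AIL linkage */
		xfs_lsn_t		li_lsn;		/* LSN in the AIL */

		/* committed item tracking, under separate CIL locks */
		struct list_head	li_cil;		/* CIL linkage */
		struct log_vector	*li_lv;		/* formatted changes */
	};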

Similar to the AIL, tracking of committed items is done through a new list
called the Committed Item List (CIL). The list tracks log items that have been
committed and have formatted memory buffers attached to them. It tracks objects
in transaction commit order, so when an object is relogged it is removed from
its place in the list and re-inserted at the tail. This is entirely arbitrary
and done to make it easy for debugging - the last items in the list are the
ones that are most recently modified. Ordering of the CIL is not necessary for
transactional integrity (as discussed in the next section) so the ordering is
done for convenience/sanity of the developers.
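
As a sketch, the insert-or-move operation this describes is a simple list
operation under a CIL-private lock (names hypothetical). Note that
list_move_tail() handles both cases: a freshly initialised, empty li_cil
entry and an item already somewhere in the list:

	spin_lock(&cil->cil_lock);
	/* new and relogged items both end up at the tail: commit order */
	list_move_tail(&lip->li_cil, &cil->cil_items);
	spin_unlock(&cil->cil_lock);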


Delayed Logging: Checkpoints

When we have a log synchronisation event, commonly known as a "log force",
all the items in the CIL must be written into the log via the log buffers.
We need to write these items in the order that they exist in the CIL, and they
need to be written as an atomic transaction. The need for all the objects to be
written as an atomic transaction comes from the requirements of relogging and
log replay - all the changes in all the objects in a given transaction must
either be completely replayed during log recovery, or not replayed at all. If
a transaction is not replayed because it is not complete in the log, then
no later transactions should be replayed, either.

To fulfill this requirement, we need to write the entire CIL in a single log
transaction. Fortunately, the XFS log code has no fixed limit on the size of a
transaction, nor does the log replay code. The only fundamental limit is that
the transaction cannot be larger than just under half the size of the log. The
reason for this limit is that to find the head and tail of the log, there must
be at least one complete transaction in the log at any given time. If a
transaction is larger than half the log, then there is the possibility that a
crash during the write of such a transaction could partially overwrite the
only complete previous transaction in the log. This will result in a recovery
failure and an inconsistent filesystem and hence we must enforce the maximum
size of a checkpoint to be slightly less than half the log.

Apart from this size requirement, a checkpoint transaction looks no different
to any other transaction - it contains a transaction header, a series of
formatted log items and a commit record at the tail. From a recovery
perspective, the checkpoint transaction is also no different - just a lot
bigger with a lot more items in it. The worst case effect of this is that we
might need to tune the recovery transaction object hash size.

Because the checkpoint is just another transaction and all the changes to log
items are stored as log vectors, we can use the existing log buffer writing
code to write the changes into the log. To do this efficiently, we need to
minimise the time we hold the CIL locked while writing the checkpoint
transaction. The current log write code enables us to do this easily with the
way it separates the writing of the transaction contents (the log vectors) from
the transaction commit record, but tracking this requires us to have a
per-checkpoint context that travels through the log write process through to
checkpoint completion.

Hence a checkpoint has a context that tracks the state of the current
checkpoint from initiation to checkpoint completion. A new context is initiated
at the same time a checkpoint transaction is started. That is, when we remove
all the current items from the CIL during a checkpoint operation, we move all
those changes into the current checkpoint context. We then initialise a new
context and attach that to the CIL for aggregation of new transactions.

This allows us to unlock the CIL immediately after transfer of all the
committed items and effectively allow new transactions to be issued while we
are formatting the checkpoint into the log. It also allows concurrent
checkpoints to be written into the log buffers in the case of log force heavy
workloads, just like the existing transaction commit code does. This, however,
requires that we strictly order the commit records in the log so that
checkpoint sequence order is maintained during log replay.

To ensure that we can be writing an item into a checkpoint transaction at
the same time another transaction modifies the item and inserts the log item
into the new CIL, the checkpoint transaction commit code cannot use log items
to store the list of log vectors that need to be written into the transaction.
Hence log vectors need to be able to be chained together to allow them to be
detached from the log items. That is, when the CIL is flushed the memory
buffer and log vector attached to each log item needs to be attached to the
checkpoint context so that the log item can be released. In diagrammatic form,
the CIL would look like this before the flush:

	CIL Head
	   |
	   V
	Log Item <-> log vector 1	-> memory buffer
	   |				-> vector array
	   V
	Log Item <-> log vector 2	-> memory buffer
	   |				-> vector array
	   V
	......
	   |
	   V
	Log Item <-> log vector N-1	-> memory buffer
	   |				-> vector array
	   V
	Log Item <-> log vector N	-> memory buffer
					-> vector array

And after the flush the CIL head is empty, and the checkpoint context log
vector list would look like:

	Checkpoint Context
	   |
	   V
	log vector 1	-> memory buffer
	   |		-> vector array
	   |		-> Log Item
	   V
	log vector 2	-> memory buffer
	   |		-> vector array
	   |		-> Log Item
	   V
	......
	   |
	   V
	log vector N-1	-> memory buffer
	   |		-> vector array
	   |		-> Log Item
	   V
	log vector N	-> memory buffer
			-> vector array
			-> Log Item

Once this transfer is done, the CIL can be unlocked and new transactions can
start, while the checkpoint flush code works over the log vector chain to
commit the checkpoint.

Once the checkpoint is written into the log buffers, the checkpoint context is
attached to the log buffer that the commit record was written to along with a
completion callback. Log IO completion will call that callback, which can then
run transaction committed processing for the log items (i.e. insert into AIL
and unpin) in the log vector chain and then free the log vector chain and
checkpoint context.

Discussion Point: I am uncertain as to whether the log item is the most
efficient way to track vectors, even though it seems like the natural way to do
it. The fact that we walk the log items (in the CIL) just to chain the log
vectors and break the link between the log item and the log vector means that
we take a cache line hit for the log item list modification, then another for
the log vector chaining. If we track by the log vectors, then we only need to
break the link between the log item and the log vector, which means we should
dirty only the log item cachelines. Normally I wouldn't be concerned about one
vs two dirty cachelines except for the fact I've seen upwards of 80,000 log
vectors in one checkpoint transaction. I'd guess this is a "measure and
compare" situation that can be done after a working and reviewed implementation
is in the dev tree....

Delayed Logging: Checkpoint Sequencing

One of the key aspects of the XFS transaction subsystem is that it tags
committed transactions with the log sequence number of the transaction commit.
This allows transactions to be issued asynchronously even though there may be
future operations that cannot be completed until that transaction is fully
committed to the log. In the rare case that a dependent operation occurs (e.g.
re-using a freed metadata extent for a data extent), a special, optimised log
force can be issued to force the dependent transaction to disk immediately.

To do this, transactions need to record the LSN of the commit record of the
transaction. This LSN comes directly from the log buffer the transaction is
written into. While this works just fine for the existing transaction
mechanism, it does not work for delayed logging because transactions are not
written directly into the log buffers. Hence some other method of sequencing
transactions is required.

As discussed in the checkpoint section, delayed logging uses per-checkpoint
contexts, and as such it is simple to assign a sequence number to each
checkpoint. Because the switching of checkpoint contexts must be done
atomically, it is simple to ensure that each new context has a monotonically
increasing sequence number assigned to it without the need for an external
atomic counter - we can just take the current context sequence number and add
one to it for the new context.

Then, instead of assigning a log buffer LSN to the transaction commit LSN
during the commit, we can assign the current checkpoint sequence. This allows
operations that track transactions that have not yet completed to know what
checkpoint sequence needs to be committed before they can continue. As a
result, the code that forces the log to a specific LSN now needs to ensure that
the log forces to a specific checkpoint.
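
A sketch of the two assignments this implies, again with hypothetical names:

	/*
	 * Context switch during a checkpoint. Done atomically under the
	 * CIL lock, so no external atomic counter is needed.
	 */
	new_ctx->sequence = cil->ctx->sequence + 1;
	cil->ctx = new_ctx;

	/* Transaction commit records a sequence, not a commit LSN. */
	tp->t_commit_seq = cil->ctx->sequence;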

To ensure that we can do this, we need to track all the checkpoint contexts
that are currently committing to the log. When we flush a checkpoint, the
context gets added to a "committing" list which can be searched. When a
checkpoint commit completes, it is removed from the committing list. Because
the checkpoint context records the LSN of the commit record for the checkpoint,
we can also wait on the log buffer that contains the commit record, thereby
using the existing log force mechanisms to execute synchronous forces.

It should be noted that the synchronous forces may need to be extended with
mitigation algorithms similar to the current log buffer code to allow
aggregation of multiple synchronous transactions if there are already
synchronous transactions being flushed. Investigation of the performance of the
current design is needed before making any decisions here.

The main concern with log forces is to ensure that all the previous checkpoints
are also committed to disk before the one we need to wait for. Therefore we
need to check that all the prior contexts in the committing list are also
complete before waiting on the one we need to complete. We do this
synchronisation in the log force code so that we don't need to wait anywhere
else for such serialisation - it only matters when we do a log force.

The only remaining complexity is that a log force now also has to handle the
case where the forcing sequence number is the same as the current context. That
is, we need to flush the CIL and potentially wait for it to complete. This is a
simple addition to the existing log forcing code to check the sequence numbers
and push if required. Indeed, placing the current sequence checkpoint flush in
the log force code enables the current mechanism for issuing synchronous
transactions to remain untouched (i.e. commit an asynchronous transaction, then
force the log at the LSN of that transaction) and so the higher level code
behaves the same regardless of whether delayed logging is being used or not.
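
Putting these pieces together, a log force to a checkpoint sequence reduces
to something like the following sketch (hypothetical names; locking and the
wait mechanism are elided):

	static void force_log_to_sequence(struct cil *cil, u64 sequence)
	{
		struct cil_ctx	*ctx;

		/* forcing the current context? push the CIL first */
		if (sequence == cil->ctx->sequence)
			cil_push(cil);

		/* wait for this checkpoint and all prior ones to commit */
		list_for_each_entry(ctx, &cil->committing, committing) {
			if (ctx->sequence <= sequence)
				wait_for_commit_record(ctx);
		}
	}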

Delayed Logging: Checkpoint Log Space Accounting

The big issue for a checkpoint transaction is the log space reservation for the
transaction. We don't know how big a checkpoint transaction is going to be
ahead of time, nor how many log buffers it will take to write out, nor how many
split log vector regions are going to be used. We can track the
amount of log space required as we add items to the commit item list, but we
still need to reserve the space in the log for the checkpoint.

A typical transaction reserves enough space in the log for the worst case space
usage of the transaction. The reservation accounts for log record headers,
transaction and region headers, headers for split regions, buffer tail padding,
etc. as well as the actual space for all the changed metadata in the
transaction. While some of this is fixed overhead, much of it is dependent on
the size of the transaction and the number of regions being logged (the number
of log vectors in the transaction).

An example of the differences would be logging directory changes versus logging
inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then
there are lots of transactions that only contain an inode core and an inode log
format structure. That is, two vectors totaling roughly 150 bytes. If we modify
10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each
vector is 12 bytes, so the total to be logged is approximately 1.75MB. In
comparison, if we are logging full directory buffers, they are typically 4KB
each, so in 1.5MB of directory buffers we'd have roughly 400 buffers and a
buffer format structure for each buffer - roughly 800 vectors or 1.51MB total
space. From this, it should be obvious that a static log space reservation is
not particularly flexible and it is difficult to select the "optimal value" for
all workloads.

Further, if we are going to use a static reservation, which bit of the entire
reservation does it cover? We account for space used by the transaction
reservation by tracking the space currently used by the object in the CIL and
then calculating the increase or decrease in space used as the object is
relogged. This allows for a checkpoint reservation to only have to account for
log buffer metadata used such as log header records.

However, even using a static reservation for just the log metadata is
problematic. Typically log record headers use at least 16KB of log space per
1MB of log space consumed (512 bytes per 32k) and the reservation needs to be
large enough to handle arbitrary sized checkpoint transactions. This
reservation needs to be made before the checkpoint is started, and we need to
be able to reserve the space without sleeping. For an 8MB checkpoint, we need a
reservation of around 150KB, which is a non-trivial amount of space.

A static reservation needs to manipulate the log grant counters - we can take a
permanent reservation on the space, but we still need to make sure we refresh
the write reservation (the actual space available to the transaction) after
every checkpoint transaction completion. Unfortunately, if this space is not
available when required, then the regrant code will sleep waiting for it.

The problem with this is that it can lead to deadlocks as we may need to commit
checkpoints to be able to free up log space (refer back to the description of
rolling transactions for an example of this). Hence we *must* always have
space available in the log if we are to use static reservations, and that is
very difficult and complex to arrange. It is possible to do, but there is a
simpler way.

The simpler way of doing this is tracking the entire log space used by the
items in the CIL and using this to dynamically calculate the amount of log
space required by the log metadata. If this log metadata space changes as a
result of a transaction commit inserting a new memory buffer into the CIL, then
the difference in space required is removed from the transaction that causes
the change. Transactions at this level will *always* have enough space
available in their reservation for this as they have already reserved the
maximal amount of log metadata space they require, and such a delta reservation
will always be less than or equal to the maximal amount in the reservation.
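
A sketch of this delta accounting at transaction commit time, under the
stated assumption that the committing transaction has already reserved the
worst-case metadata space for its items (names hypothetical):

	/* called under the CIL lock as a formatted buffer is inserted */
	static void cil_account_space(struct cil *cil, struct tx *tp,
				      int old_len, int new_len)
	{
		int	diff = new_len - old_len;

		cil->space_used += diff;
		/* steal the growth from the committing transaction */
		tp->t_reservation -= diff;
	}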

Hence we can grow the checkpoint transaction reservation dynamically as items
are added to the CIL and avoid the need for reserving and regranting log space
up front. This avoids deadlocks and removes a blocking point from the
checkpoint flush code.

As mentioned earlier, transactions can't grow to more than half the size of the
log. Hence as part of the reservation growing, we need to also check the size
of the reservation against the maximum allowed transaction size. If we reach
the maximum threshold, we need to push the CIL to the log. This is effectively
a "background flush" and is done on demand. This is identical to
a CIL push triggered by a log force, except that there is no waiting for the
checkpoint commit to complete. This background push is checked and executed by
transaction commit code.
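
The background push check at the end of transaction commit is then nothing
more than a threshold comparison, sketched here with hypothetical names:

	static void cil_push_background(struct cil *cil)
	{
		/* threshold: just under half the log size */
		if (cil->space_used >= cil->push_threshold)
			cil_push(cil);	/* no waiting for completion */
	}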

If the transaction subsystem goes idle while we still have items in the CIL,
they will be flushed by the periodic log force issued by the xfssyncd. This log
force will push the CIL to disk, and if the transaction subsystem stays idle,
allow the idle log to be covered (effectively marked clean) in exactly the same
manner that is done for the existing logging method. A discussion point is
whether this log force needs to be done more frequently than the current rate,
which is once every 30s.


Delayed Logging: Log Item Pinning

Currently log items are pinned during transaction commit while the items are
still locked. This happens just after the items are formatted, though it could
be done any time before the items are unlocked. The result of this mechanism is
that items get pinned once for every transaction that is committed to the log
buffers. Hence items that are relogged in the log buffers will have a pin count
for every outstanding transaction they were dirtied in. When each of these
transactions is completed, they will unpin the item once. As a result, the item
only becomes unpinned when all the transactions complete and there are no
pending transactions. Thus the pinning and unpinning of a log item is symmetric
as there is a 1:1 relationship between transaction commit and log item
completion.

For delayed logging, however, we have an asymmetric transaction commit to
completion relationship. Every time an object is relogged in the CIL it goes
through the commit process without a corresponding completion being registered.
That is, we now have a many-to-one relationship between transaction commit and
log item completion. The result of this is that pinning and unpinning of the
log items becomes unbalanced if we retain the "pin on transaction commit, unpin
on transaction completion" model.

To keep pin/unpin symmetry, the algorithm needs to change to a "pin on
insertion into the CIL, unpin on checkpoint completion". In other words, the
pinning and unpinning becomes symmetric around a checkpoint context. We have to
pin the object the first time it is inserted into the CIL - if it is already in
the CIL during a transaction commit, then we do not pin it again. Because there
can be multiple outstanding checkpoint contexts, we can still see elevated pin
counts, but as each checkpoint completes the pin count will retain the correct
value according to its context.

Just to make matters slightly more complex, this checkpoint level context
for the pin count means that the pinning of an item must take place under the
CIL commit/flush lock. If we pin the object outside this lock, we cannot
guarantee which context the pin count is associated with. This is because of
the fact that pinning the item is dependent on whether the item is present in
the current CIL or not. If we don't take the CIL lock first before we check and
pin the object, we have a race with the CIL being flushed between the check and
the pin (or not pinning, as the case may be). Hence we must hold the CIL
flush/commit lock to guarantee that we pin the items correctly.
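
A sketch of the resulting commit-side code, showing how the pin check nests
inside the flush lock (hypothetical names):

	down_read(&cil->flush_lock);	/* holds out a CIL flush */
	spin_lock(&cil->cil_lock);
	if (list_empty(&lip->li_cil))	/* first insertion this context */
		pin_item(lip);
	list_move_tail(&lip->li_cil, &cil->cil_items);
	spin_unlock(&cil->cil_lock);
	up_read(&cil->flush_lock);

If the list_empty() check and pin_item() were done before taking flush_lock,
a concurrent CIL flush could empty the CIL between the check and the
insertion, associating the pin with the wrong checkpoint context.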

Delayed Logging: Concurrent Scalability

A fundamental requirement for the CIL is that accesses through transaction
commits must scale to many concurrent commits. The current transaction commit
code does not break down even when there are transactions coming from 2048
processors at once. The current transaction code does not go any faster than if
there was only one CPU using it, but it does not slow down either.

As a result, the delayed logging transaction commit code needs to be designed
for concurrency from the ground up. It is obvious that there are serialisation
points in the design - the three important ones are:

	1. Locking out new transaction commits while flushing the CIL
	2. Adding items to the CIL and updating item space accounting
	3. Checkpoint commit ordering

Looking at the transaction commit and CIL flushing interactions, it is clear
that we have a many-to-one interaction here. That is, the only restriction on
the number of concurrent transactions that can be trying to commit at once is
the amount of space available in the log for their reservations. The practical
limit here is in the order of several hundred concurrent transactions for a
128MB log, which means that it is generally one per CPU in a machine.

The amount of time a transaction commit needs to hold out a flush is a
relatively long period of time - the pinning of log items needs to be done
while we are holding out a CIL flush, so at the moment that means it is held
across the formatting of the objects into memory buffers (i.e. while memcpy()s
are in progress). Ultimately a two pass algorithm where the formatting is done
separately to the pinning of objects could be used to reduce the hold time of
the transaction commit side.

Because of the number of potential transaction commit side holders, the lock
really needs to be a sleeping lock - if the CIL flush takes the lock, we do not
want every other CPU in the machine spinning on the CIL lock. Given that
flushing the CIL could involve walking a list of tens of thousands of log
items, it will get held for a significant time and so spin contention is a
significant concern. Preventing lots of CPUs spinning doing nothing is the
main reason for choosing a sleeping lock even though nothing in either the
transaction commit or CIL flush side sleeps with the lock held.

It should also be noted that CIL flushing is also a relatively rare operation
compared to transaction commit for asynchronous transaction workloads - only
time will tell if using a read-write semaphore for exclusion will limit
transaction commit concurrency due to cache line bouncing of the lock on the
read side.

The second serialisation point is on the transaction commit side where items
are inserted into the CIL. Because transactions can enter this code
concurrently, the CIL needs to be protected separately from the above
commit/flush exclusion. It also needs to be an exclusive lock but it is only
held for a very short time and so a spin lock is appropriate here. It is
possible that this lock will become a contention point, but given the short
hold time once per transaction, I think that contention is unlikely.

The final serialisation point is the checkpoint commit record ordering code
that is run as part of the checkpoint commit and log force sequencing. The code
path that triggers a CIL flush (i.e. whatever triggers the log force) will enter
an ordering loop after writing all the log vectors into the log buffers but
before writing the commit record. This loop walks the list of committing
checkpoints and needs to block waiting for checkpoints to complete their commit
record write. As a result it needs a lock and a wait variable. Log force
sequencing also requires the same lock, list walk, and blocking mechanism to
ensure completion of checkpoints.
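
A sketch of that ordering loop (hypothetical names; the real code must also
drop and retake locks around the sleep):

	/* after writing our log vectors, before writing our commit record */
	restart:
	list_for_each_entry(ctx, &cil->committing, committing) {
		/* only checkpoints earlier than ours matter for ordering */
		if (ctx->sequence >= our_ctx->sequence)
			continue;
		if (!ctx->commit_lsn) {
			/* an earlier commit record is not yet written */
			wait_for_commit_wakeup(cil);	/* hypothetical */
			goto restart;
		}
	}
	write_commit_record(our_ctx);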

These two sequencing operations can use the same mechanism even though the
events they are waiting for are different. The checkpoint commit record
sequencing needs to wait until checkpoint contexts contain a commit LSN
(obtained through completion of a commit record write) while log force
sequencing needs to wait until previous checkpoint contexts are removed from
the committing list (i.e. they've completed). A simple wait variable and
broadcast wakeups (thundering herds) have been used to implement these two
serialisation queues. They use the same lock as the CIL, too. If we see too
much contention on the CIL lock, or too many context switches as a result of
the broadcast wakeups, these operations can be put under a new spinlock and
given separate wait lists to reduce lock contention and the number of processes
woken by the wrong event.


Lifecycle Changes

The existing log item life cycle is as follows:

	1. Transaction allocate
	2. Transaction reserve
	3. Lock item
	4. Join item to transaction
		If not already attached,
			Allocate log item
			Attach log item to owner item
		Attach log item to transaction
	5. Modify item
		Record modifications in log item
	6. Transaction commit
		Pin item in memory
		Format item into log buffer
		Write commit LSN into transaction
		Unlock item
		Attach transaction to log buffer

	<log buffer IO dispatched>
	<log buffer IO completes>

	7. Transaction completion
		Mark log item committed
		Insert log item into AIL
			Write commit LSN into log item
		Unpin log item
	8. AIL traversal
		Lock item
		Mark log item clean
		Flush item to disk

	<item IO completion>

	9. Log item removed from AIL
		Moves log tail
		Item unlocked

Essentially, steps 1-6 operate independently from step 7, which is also
independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9
at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur
at the same time. If the log item is in the AIL or between steps 6 and 7
and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9
are entered and completed is the object considered clean.

With delayed logging, there are new steps inserted into the life cycle:

	1. Transaction allocate
	2. Transaction reserve
	3. Lock item
	4. Join item to transaction
		If not already attached,
			Allocate log item
			Attach log item to owner item
		Attach log item to transaction
	5. Modify item
		Record modifications in log item
	6. Transaction commit
		Pin item in memory if not pinned in CIL
		Format item into log vector + buffer
		Attach log vector and buffer to log item
		Insert log item into CIL
		Write CIL context sequence into transaction
		Unlock item

	<next log force>

	7. CIL push
		lock CIL flush
		Chain log vectors and buffers together
		Remove items from CIL
		unlock CIL flush
		write log vectors into log
		sequence commit records
		attach checkpoint context to log buffer

	<log buffer IO dispatched>
	<log buffer IO completes>

	8. Checkpoint completion
		Mark log item committed
		Insert item into AIL
			Write commit LSN into log item
		Unpin log item
	9. AIL traversal
		Lock item
		Mark log item clean
		Flush item to disk
	<item IO completion>
	10. Log item removed from AIL
		Moves log tail
		Item unlocked

From this, it can be seen that the only life cycle differences between the two
logging methods are in the middle of the life cycle - they still have the same
beginning and end and execution constraints. The only differences are in the
committing of the log items to the log itself and the completion processing.
Hence delayed logging should not introduce any constraints on log item
behaviour, allocation or freeing that don't already exist.

As a result of this zero-impact "insertion" of delayed logging infrastructure
and the design of the internal structures to avoid on-disk format changes, we
can basically switch between delayed logging and the existing mechanism with a
mount option. Fundamentally, there is no reason why the log manager would not
be able to swap methods automatically and transparently depending on load
characteristics, but this should not be necessary if delayed logging works as
designed.

Roadmap:

2.6.35 Inclusion in mainline as an experimental mount option
	=> approximately 2-3 months to merge window
	=> needs to be in xfs-dev tree in 4-6 weeks
	=> code is nearing readiness for review

2.6.37 Remove experimental tag from mount option
	=> should be roughly 6 months after initial merge
	=> enough time to:
		=> gain confidence and fix problems reported by early
		   adopters (a.k.a. guinea pigs)
		=> address worst performance regressions and undesired
		   behaviours
		=> start tuning/optimising code for parallelism
		=> start tuning/optimising algorithms consuming
		   excessive CPU time

2.6.39 Switch default mount option to use delayed logging
	=> should be roughly 12 months after initial merge
	=> enough time to shake out remaining problems before next round of
	   enterprise distro kernel rebases
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index f5fce483930c..b56ea860da21 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -290,9 +290,6 @@ and is between 256 and 4096 characters. It is defined in the file
 	advansys=	[HW,SCSI]
 			See header of drivers/scsi/advansys.c.
 
-	advwdt=		[HW,WDT] Advantech WDT
-			Format: <iostart>,<iostop>
-
 	aedsp16=	[HW,OSS] Audio Excel DSP 16
 			Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
 			See also header of sound/oss/aedsp16.c.
@@ -765,9 +762,6 @@ and is between 256 and 4096 characters. It is defined in the file
 			This option is obsoleted by the "netdev=" option, which
 			has equivalent usage. See its documentation for details.
 
-	eurwdt=		[HW,WDT] Eurotech CPU-1220/1410 onboard watchdog.
-			Format: <io>[,<irq>]
-
 	failslab=
 	fail_page_alloc=
 	fail_make_request=[KNL]
@@ -2267,9 +2261,6 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	sched_debug	[KNL] Enables verbose scheduler debug messages.
 
-	sc1200wdt=	[HW,WDT] SC1200 WDT (watchdog) driver
-			Format: <io>[,<timeout>[,<isapnp>]]
-
 	scsi_debug_*=	[SCSI]
 			See drivers/scsi/scsi_debug.c.
 
@@ -2858,8 +2849,10 @@ and is between 256 and 4096 characters. It is defined in the file
 	wd7000=		[HW,SCSI]
 			See header of drivers/scsi/wd7000.c.
 
-	wdt=		[WDT] Watchdog
-			See Documentation/watchdog/wdt.txt.
+	watchdog timers	[HW,WDT] For information on watchdog timers,
+			see Documentation/watchdog/watchdog-parameters.txt
+			or other driver-specific files in the
+			Documentation/watchdog/ directory.
 
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6c7d18c53f84..5fdbb612aeb8 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -19,6 +19,7 @@ files can be found in mm/swap.c.
19Currently, these files are in /proc/sys/vm: 19Currently, these files are in /proc/sys/vm:
20 20
21- block_dump 21- block_dump
22- compact_memory
22- dirty_background_bytes 23- dirty_background_bytes
23- dirty_background_ratio 24- dirty_background_ratio
24- dirty_bytes 25- dirty_bytes
@@ -26,6 +27,7 @@ Currently, these files are in /proc/sys/vm:
26- dirty_ratio 27- dirty_ratio
27- dirty_writeback_centisecs 28- dirty_writeback_centisecs
28- drop_caches 29- drop_caches
30- extfrag_threshold
29- hugepages_treat_as_movable 31- hugepages_treat_as_movable
30- hugetlb_shm_group 32- hugetlb_shm_group
31- laptop_mode 33- laptop_mode
@@ -64,6 +66,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
64 66
65============================================================== 67==============================================================
66 68
69compact_memory
70
71Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
72all zones are compacted such that free memory is available in contiguous
73blocks where possible. This can be important for example in the allocation of
74huge pages although processes will also directly compact memory as required.
75
76==============================================================
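[editor's note: a minimal usage sketch, not part of the patch — assuming root and a
CONFIG_COMPACTION kernel, a one-off compaction of all zones can be triggered with:

	echo 1 > /proc/sys/vm/compact_memory
]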
77
67dirty_background_bytes 78dirty_background_bytes
68 79
69Contains the amount of dirty memory at which the pdflush background writeback 80Contains the amount of dirty memory at which the pdflush background writeback
@@ -139,6 +150,20 @@ user should run `sync' first.
139 150
140============================================================== 151==============================================================
141 152
153extfrag_threshold
154
155This parameter affects whether the kernel will compact memory or direct
156reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what
157the fragmentation index for each order is in each zone in the system. Values
158tending towards 0 imply allocations would fail due to lack of memory,
159values towards 1000 imply failures are due to fragmentation and -1 implies
160that the allocation will succeed as long as watermarks are met.
161
162The kernel will not compact memory in a zone if the
163fragmentation index is <= extfrag_threshold. The default value is 500.
164
165==============================================================
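[editor's note: a usage sketch, not part of the patch — the threshold is a regular
sysctl under the /proc/sys/vm path listed above, so it can be inspected and tuned
at runtime; lowering it biases the kernel toward compaction rather than direct
reclaim for high-order allocations:

	cat /proc/sys/vm/extfrag_threshold
	echo 250 > /proc/sys/vm/extfrag_threshold
]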
166
142hugepages_treat_as_movable 167hugepages_treat_as_movable
143 168
144This parameter is only useful when kernelcore= is specified at boot time to 169This parameter is only useful when kernelcore= is specified at boot time to
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX
index c3ea47e507fe..ee994513a9b1 100644
--- a/Documentation/watchdog/00-INDEX
+++ b/Documentation/watchdog/00-INDEX
@@ -1,10 +1,15 @@
100-INDEX 100-INDEX
2 - this file. 2 - this file.
3hpwdt.txt
4 - information on the HP iLO2 NMI watchdog
3pcwd-watchdog.txt 5pcwd-watchdog.txt
4 - documentation for Berkshire Products PC Watchdog ISA cards. 6 - documentation for Berkshire Products PC Watchdog ISA cards.
5src/ 7src/
6 - directory holding watchdog related example programs. 8 - directory holding watchdog related example programs.
7watchdog-api.txt 9watchdog-api.txt
8 - description of the Linux Watchdog driver API. 10 - description of the Linux Watchdog driver API.
11watchdog-parameters.txt
12 - information on driver parameters (for drivers other than
13 the ones that have driver-specific files here)
9wdt.txt 14wdt.txt
10 - description of the Watchdog Timer Interfaces for Linux. 15 - description of the Watchdog Timer Interfaces for Linux.
diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt
new file mode 100644
index 000000000000..41c95cc1dc1f
--- /dev/null
+++ b/Documentation/watchdog/watchdog-parameters.txt
@@ -0,0 +1,390 @@
1This file provides information on the module parameters of many of
2the Linux watchdog drivers. Watchdog driver parameter specs should
3be listed here unless the driver has its own driver-specific information
4file.
5
6
7See Documentation/kernel-parameters.txt for information on
8providing kernel parameters for builtin drivers versus loadable
9modules.
10
11
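[editor's note: an illustrative sketch, not part of the patch — using the softdog
driver documented below, a parameter can be given at module load time, or, for a
builtin driver, on the kernel command line prefixed with the driver name:

	modprobe softdog soft_margin=120 nowayout=1
	softdog.soft_margin=120 softdog.nowayout=1   (on the kernel command line)
]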
12-------------------------------------------------
13acquirewdt:
14wdt_stop: Acquire WDT 'stop' io port (default 0x43)
15wdt_start: Acquire WDT 'start' io port (default 0x443)
16nowayout: Watchdog cannot be stopped once started
17 (default=kernel config parameter)
18-------------------------------------------------
19advantechwdt:
20wdt_stop: Advantech WDT 'stop' io port (default 0x443)
21wdt_start: Advantech WDT 'start' io port (default 0x443)
22timeout: Watchdog timeout in seconds. 1<= timeout <=63, default=60.
23nowayout: Watchdog cannot be stopped once started
24 (default=kernel config parameter)
25-------------------------------------------------
26alim1535_wdt:
27timeout: Watchdog timeout in seconds. (0 < timeout < 18000, default=60)
28nowayout: Watchdog cannot be stopped once started
29 (default=kernel config parameter)
30-------------------------------------------------
31alim7101_wdt:
32timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
33use_gpio: Use the gpio watchdog (required by old cobalt boards).
34 default=0/off/no
35nowayout: Watchdog cannot be stopped once started
36 (default=kernel config parameter)
37-------------------------------------------------
38ar7_wdt:
39margin: Watchdog margin in seconds (default=60)
40nowayout: Disable watchdog shutdown on close
41 (default=kernel config parameter)
42-------------------------------------------------
43at32ap700x_wdt:
44timeout: Timeout value. Limited to be 1 or 2 seconds. (default=2)
45nowayout: Watchdog cannot be stopped once started
46 (default=kernel config parameter)
47-------------------------------------------------
48at91rm9200_wdt:
49wdt_time: Watchdog time in seconds. (default=5)
50nowayout: Watchdog cannot be stopped once started
51 (default=kernel config parameter)
52-------------------------------------------------
53at91sam9_wdt:
54heartbeat: Watchdog heartbeats in seconds. (default = 15)
55nowayout: Watchdog cannot be stopped once started
56 (default=kernel config parameter)
57-------------------------------------------------
58bcm47xx_wdt:
59wdt_time: Watchdog time in seconds. (default=30)
60nowayout: Watchdog cannot be stopped once started
61 (default=kernel config parameter)
62-------------------------------------------------
63bfin_wdt:
64timeout: Watchdog timeout in seconds. (1<=timeout<=((2^32)/SCLK), default=20)
65nowayout: Watchdog cannot be stopped once started
66 (default=kernel config parameter)
67-------------------------------------------------
68coh901327_wdt:
69margin: Watchdog margin in seconds (default 60s)
70-------------------------------------------------
71cpu5wdt:
72port: base address of watchdog card, default is 0x91
73verbose: be verbose, default is 0 (no)
74ticks: count down ticks, default is 10000
75-------------------------------------------------
76cpwd:
77wd0_timeout: Default watchdog0 timeout in 1/10secs
78wd1_timeout: Default watchdog1 timeout in 1/10secs
79wd2_timeout: Default watchdog2 timeout in 1/10secs
80-------------------------------------------------
81davinci_wdt:
82heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60
83-------------------------------------------------
84ep93xx_wdt:
85nowayout: Watchdog cannot be stopped once started
86timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD)
87-------------------------------------------------
88eurotechwdt:
89nowayout: Watchdog cannot be stopped once started
90 (default=kernel config parameter)
91io: Eurotech WDT io port (default=0x3f0)
92irq: Eurotech WDT irq (default=10)
93ev: Eurotech WDT event type (default is `int')
94-------------------------------------------------
95gef_wdt:
96nowayout: Watchdog cannot be stopped once started
97 (default=kernel config parameter)
98-------------------------------------------------
99geodewdt:
100timeout: Watchdog timeout in seconds. 1<= timeout <=131, default=60.
101nowayout: Watchdog cannot be stopped once started
102 (default=kernel config parameter)
103-------------------------------------------------
104i6300esb:
105heartbeat: Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30)
106nowayout: Watchdog cannot be stopped once started
107 (default=kernel config parameter)
108-------------------------------------------------
109iTCO_wdt:
110heartbeat: Watchdog heartbeat in seconds.
111 (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30)
112nowayout: Watchdog cannot be stopped once started
113 (default=kernel config parameter)
114-------------------------------------------------
115iTCO_vendor_support:
116vendorsupport: iTCO vendor specific support mode, default=0 (none),
117 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS
118-------------------------------------------------
119ib700wdt:
120timeout: Watchdog timeout in seconds. 0<= timeout <=30, default=30.
121nowayout: Watchdog cannot be stopped once started
122 (default=kernel config parameter)
123-------------------------------------------------
124ibmasr:
125nowayout: Watchdog cannot be stopped once started
126 (default=kernel config parameter)
127-------------------------------------------------
128indydog:
129nowayout: Watchdog cannot be stopped once started
130 (default=kernel config parameter)
131-------------------------------------------------
132iop_wdt:
133nowayout: Watchdog cannot be stopped once started
134 (default=kernel config parameter)
135-------------------------------------------------
136it8712f_wdt:
137margin: Watchdog margin in seconds (default 60)
138nowayout: Disable watchdog shutdown on close
139 (default=kernel config parameter)
140-------------------------------------------------
141it87_wdt:
142nogameport: Forbid the activation of game port, default=0
143exclusive: Watchdog exclusive device open, default=1
144timeout: Watchdog timeout in seconds, default=60
145testmode: Watchdog test mode (1 = no reboot), default=0
146nowayout: Watchdog cannot be stopped once started
147 (default=kernel config parameter)
148-------------------------------------------------
149ixp2000_wdt:
150heartbeat: Watchdog heartbeat in seconds (default 60s)
151nowayout: Watchdog cannot be stopped once started
152 (default=kernel config parameter)
153-------------------------------------------------
154ixp4xx_wdt:
155heartbeat: Watchdog heartbeat in seconds (default 60s)
156nowayout: Watchdog cannot be stopped once started
157 (default=kernel config parameter)
158-------------------------------------------------
159ks8695_wdt:
160wdt_time: Watchdog time in seconds. (default=5)
161nowayout: Watchdog cannot be stopped once started
162 (default=kernel config parameter)
163-------------------------------------------------
164machzwd:
165nowayout: Watchdog cannot be stopped once started
166 (default=kernel config parameter)
167action: after watchdog resets, generate:
168 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI
169-------------------------------------------------
170max63xx_wdt:
171heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 60
172nowayout: Watchdog cannot be stopped once started
173 (default=kernel config parameter)
174nodelay: Force selection of a timeout setting without initial delay
175 (max6373/74 only, default=0)
176-------------------------------------------------
177mixcomwd:
178nowayout: Watchdog cannot be stopped once started
179 (default=kernel config parameter)
180-------------------------------------------------
181mpc8xxx_wdt:
182timeout: Watchdog timeout in ticks. (0<timeout<65536, default=65535)
183reset: Watchdog Interrupt/Reset Mode. 0 = interrupt, 1 = reset
184nowayout: Watchdog cannot be stopped once started
185 (default=kernel config parameter)
186-------------------------------------------------
187mpcore_wdt:
188mpcore_margin: MPcore timer margin in seconds.
189 (0 < mpcore_margin < 65536, default=60)
190nowayout: Watchdog cannot be stopped once started
191 (default=kernel config parameter)
192mpcore_noboot: MPcore watchdog action, set to 1 to ignore reboots,
193 0 to reboot (default=0)
194-------------------------------------------------
195mv64x60_wdt:
196nowayout: Watchdog cannot be stopped once started
197 (default=kernel config parameter)
198-------------------------------------------------
199nuc900_wdt:
200heartbeat: Watchdog heartbeats in seconds.
201 (default = 15)
202nowayout: Watchdog cannot be stopped once started
203 (default=kernel config parameter)
204-------------------------------------------------
205omap_wdt:
206timer_margin: initial watchdog timeout (in seconds)
207-------------------------------------------------
208orion_wdt:
209heartbeat: Initial watchdog heartbeat in seconds
210nowayout: Watchdog cannot be stopped once started
211 (default=kernel config parameter)
212-------------------------------------------------
213pc87413_wdt:
214io: pc87413 WDT I/O port (default: io).
215timeout: Watchdog timeout in minutes (default=timeout).
216nowayout: Watchdog cannot be stopped once started
217 (default=kernel config parameter)
218-------------------------------------------------
219pika_wdt:
220heartbeat: Watchdog heartbeats in seconds. (default = 15)
221nowayout: Watchdog cannot be stopped once started
222 (default=kernel config parameter)
223-------------------------------------------------
224pnx4008_wdt:
225heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 19
226nowayout: Set to 1 to keep watchdog running after device release
227-------------------------------------------------
228pnx833x_wdt:
229timeout: Watchdog timeout in MHz. (68MHz clock), default=2040000000 (30 seconds)
230nowayout: Watchdog cannot be stopped once started
231 (default=kernel config parameter)
232start_enabled: Watchdog is started on module insertion (default=1)
233-------------------------------------------------
234rc32434_wdt:
235timeout: Watchdog timeout value, in seconds (default=20)
236nowayout: Watchdog cannot be stopped once started
237 (default=kernel config parameter)
238-------------------------------------------------
239riowd:
240riowd_timeout: Watchdog timeout in minutes (default=1)
241-------------------------------------------------
242s3c2410_wdt:
243tmr_margin: Watchdog tmr_margin in seconds. (default=15)
244tmr_atboot: Watchdog is started at boot time if set to 1, default=0
245nowayout: Watchdog cannot be stopped once started
246 (default=kernel config parameter)
247soft_noboot: Watchdog action, set to 1 to ignore reboots, 0 to reboot
248debug: Watchdog debug, set to >1 for debug, (default 0)
249-------------------------------------------------
250sa1100_wdt:
251margin: Watchdog margin in seconds (default 60s)
252-------------------------------------------------
253sb_wdog:
254timeout: Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs)
255-------------------------------------------------
256sbc60xxwdt:
257wdt_stop: SBC60xx WDT 'stop' io port (default 0x45)
258wdt_start: SBC60xx WDT 'start' io port (default 0x443)
259timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
260nowayout: Watchdog cannot be stopped once started
261 (default=kernel config parameter)
262-------------------------------------------------
263sbc7240_wdt:
264timeout: Watchdog timeout in seconds. (1<=timeout<=255, default=30)
265nowayout: Disable watchdog when closing device file
266-------------------------------------------------
267sbc8360:
268timeout: Index into timeout table (0-63) (default=27 (60s))
269nowayout: Watchdog cannot be stopped once started
270 (default=kernel config parameter)
271-------------------------------------------------
272sbc_epx_c3:
273nowayout: Watchdog cannot be stopped once started
274 (default=kernel config parameter)
275-------------------------------------------------
276sbc_fitpc2_wdt:
277margin: Watchdog margin in seconds (default 60s)
278nowayout: Watchdog cannot be stopped once started
279-------------------------------------------------
280sc1200wdt:
281isapnp: When set to 0 driver ISA PnP support will be disabled (default=1)
282io: io port
283timeout: range is 0-255 minutes, default is 1
284nowayout: Watchdog cannot be stopped once started
285 (default=kernel config parameter)
286-------------------------------------------------
287sc520_wdt:
288timeout: Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30)
289nowayout: Watchdog cannot be stopped once started
290 (default=kernel config parameter)
291-------------------------------------------------
292sch311x_wdt:
293force_id: Override the detected device ID
294therm_trip: Should a ThermTrip trigger the reset generator
295timeout: Watchdog timeout in seconds. 1<= timeout <=15300, default=60
296nowayout: Watchdog cannot be stopped once started
297 (default=kernel config parameter)
298-------------------------------------------------
299scx200_wdt:
300margin: Watchdog margin in seconds
301nowayout: Disable watchdog shutdown on close
302-------------------------------------------------
303shwdt:
304clock_division_ratio: Clock division ratio. Valid ranges are from 0x5 (1.31ms)
305 to 0x7 (5.25ms). (default=7)
306heartbeat: Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30)
307nowayout: Watchdog cannot be stopped once started
308 (default=kernel config parameter)
309-------------------------------------------------
310smsc37b787_wdt:
311timeout: range is 1-255 units, default is 60
312nowayout: Watchdog cannot be stopped once started
313 (default=kernel config parameter)
314-------------------------------------------------
315softdog:
316soft_margin: Watchdog soft_margin in seconds.
317 (0 < soft_margin < 65536, default=60)
318nowayout: Watchdog cannot be stopped once started
319 (default=kernel config parameter)
320soft_noboot: Softdog action, set to 1 to ignore reboots, 0 to reboot
321 (default=0)
322-------------------------------------------------
323stmp3xxx_wdt:
324heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19
325-------------------------------------------------
326ts72xx_wdt:
327timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8)
328nowayout: Disable watchdog shutdown on close
329-------------------------------------------------
330twl4030_wdt:
331nowayout: Watchdog cannot be stopped once started
332 (default=kernel config parameter)
333-------------------------------------------------
334txx9wdt:
335timeout: Watchdog timeout in seconds. (0<timeout<N, default=60)
336nowayout: Watchdog cannot be stopped once started
337 (default=kernel config parameter)
338-------------------------------------------------
339w83627hf_wdt:
340wdt_io: w83627hf/thf WDT io port (default 0x2E)
341timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
342nowayout: Watchdog cannot be stopped once started
343 (default=kernel config parameter)
344-------------------------------------------------
345w83697hf_wdt:
346wdt_io: w83697hf/hg WDT io port (default 0x2e, 0 = autodetect)
347timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60)
348nowayout: Watchdog cannot be stopped once started
349 (default=kernel config parameter)
350early_disable: Watchdog gets disabled at boot time (default=1)
351-------------------------------------------------
352w83697ug_wdt:
353wdt_io: w83697ug/uf WDT io port (default 0x2e)
354timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60)
355nowayout: Watchdog cannot be stopped once started
356 (default=kernel config parameter)
357-------------------------------------------------
358w83877f_wdt:
359timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
360nowayout: Watchdog cannot be stopped once started
361 (default=kernel config parameter)
362-------------------------------------------------
363w83977f_wdt:
364timeout: Watchdog timeout in seconds (15..7635, default=45)
365testmode: Watchdog testmode (1 = no reboot), default=0
366nowayout: Watchdog cannot be stopped once started
367 (default=kernel config parameter)
368-------------------------------------------------
369wafer5823wdt:
370timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
371nowayout: Watchdog cannot be stopped once started
372 (default=kernel config parameter)
373-------------------------------------------------
374wdt285:
375soft_margin: Watchdog timeout in seconds (default=60)
376-------------------------------------------------
377wdt977:
378timeout: Watchdog timeout in seconds (60..15300, default=60)
379testmode: Watchdog testmode (1 = no reboot), default=0
380nowayout: Watchdog cannot be stopped once started
381 (default=kernel config parameter)
382-------------------------------------------------
383wm831x_wdt:
384nowayout: Watchdog cannot be stopped once started
385 (default=kernel config parameter)
386-------------------------------------------------
387wm8350_wdt:
388nowayout: Watchdog cannot be stopped once started
389 (default=kernel config parameter)
390-------------------------------------------------
diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.txt
index 03fd756d976d..061c2e35384f 100644
--- a/Documentation/watchdog/wdt.txt
+++ b/Documentation/watchdog/wdt.txt
@@ -14,14 +14,22 @@ reboot will depend on the state of the machines and interrupts. The hardware
14boards physically pull the machine down off their own onboard timers and 14boards physically pull the machine down off their own onboard timers and
15will reboot from almost anything. 15will reboot from almost anything.
16 16
17A second temperature monitoring interface is available on the WDT501P cards 17A second temperature monitoring interface is available on the WDT501P cards.
18This provides /dev/temperature. This is the machine internal temperature in 18This provides /dev/temperature. This is the machine internal temperature in
19degrees Fahrenheit. Each read returns a single byte giving the temperature. 19degrees Fahrenheit. Each read returns a single byte giving the temperature.
20 20
21The third interface logs kernel messages on additional alert events. 21The third interface logs kernel messages on additional alert events.
22 22
23The wdt card cannot be safely probed for. Instead you need to pass 23The ICS ISA-bus wdt card cannot be safely probed for. Instead you need to
24wdt=ioaddr,irq as a boot parameter - eg "wdt=0x240,11". 24pass IO address and IRQ boot parameters. E.g.:
25 wdt.io=0x240 wdt.irq=11
26
27Other "wdt" driver parameters are:
28 heartbeat Watchdog heartbeat in seconds (default 60)
29 nowayout Watchdog cannot be stopped once started (kernel
30 build parameter)
31 tachometer WDT501-P Fan Tachometer support (0=disable, default=0)
32 type WDT501-P Card type (500 or 501, default=500)
25 33
26Features 34Features
27-------- 35--------
@@ -40,4 +48,3 @@ Minor numbers are however allocated for it.
40 48
41 49
42Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c 50Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c
43
diff --git a/arch/alpha/math-emu/sfp-util.h b/arch/alpha/math-emu/sfp-util.h
index f53707f77455..d4c6ae7fee47 100644
--- a/arch/alpha/math-emu/sfp-util.h
+++ b/arch/alpha/math-emu/sfp-util.h
@@ -28,8 +28,3 @@ extern unsigned long __udiv_qrnnd (unsigned long *, unsigned long,
28#define UDIV_NEEDS_NORMALIZATION 1 28#define UDIV_NEEDS_NORMALIZATION 1
29 29
30#define abort() goto bad_insn 30#define abort() goto bad_insn
31
32#ifndef __LITTLE_ENDIAN
33#define __LITTLE_ENDIAN -1
34#endif
35#define __BYTE_ORDER __LITTLE_ENDIAN
diff --git a/arch/arm/plat-samsung/include/plat/regs-rtc.h b/arch/arm/plat-samsung/include/plat/regs-rtc.h
index d5837cf8e402..65c190d142dd 100644
--- a/arch/arm/plat-samsung/include/plat/regs-rtc.h
+++ b/arch/arm/plat-samsung/include/plat/regs-rtc.h
@@ -20,6 +20,10 @@
20#define S3C2410_RTCCON_CLKSEL (1<<1) 20#define S3C2410_RTCCON_CLKSEL (1<<1)
21#define S3C2410_RTCCON_CNTSEL (1<<2) 21#define S3C2410_RTCCON_CNTSEL (1<<2)
22#define S3C2410_RTCCON_CLKRST (1<<3) 22#define S3C2410_RTCCON_CLKRST (1<<3)
23#define S3C64XX_RTCCON_TICEN (1<<8)
24
25#define S3C64XX_RTCCON_TICMSK (0xF<<7)
26#define S3C64XX_RTCCON_TICSHT (7)
23 27
24#define S3C2410_TICNT S3C2410_RTCREG(0x44) 28#define S3C2410_TICNT S3C2410_RTCREG(0x44)
25#define S3C2410_TICNT_ENABLE (1<<7) 29#define S3C2410_TICNT_ENABLE (1<<7)
diff --git a/arch/frv/include/asm/cache.h b/arch/frv/include/asm/cache.h
index 2797163b8f4f..7dc0f0f85b7c 100644
--- a/arch/frv/include/asm/cache.h
+++ b/arch/frv/include/asm/cache.h
@@ -17,6 +17,8 @@
17#define L1_CACHE_SHIFT (CONFIG_FRV_L1_CACHE_SHIFT) 17#define L1_CACHE_SHIFT (CONFIG_FRV_L1_CACHE_SHIFT)
18#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) 18#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
19 19
20#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES
21
20#define __cacheline_aligned __attribute__((aligned(L1_CACHE_BYTES))) 22#define __cacheline_aligned __attribute__((aligned(L1_CACHE_BYTES)))
21#define ____cacheline_aligned __attribute__((aligned(L1_CACHE_BYTES))) 23#define ____cacheline_aligned __attribute__((aligned(L1_CACHE_BYTES)))
22 24
diff --git a/arch/frv/include/asm/gdb-stub.h b/arch/frv/include/asm/gdb-stub.h
index 2da716407ff2..e6bedd0cd9a5 100644
--- a/arch/frv/include/asm/gdb-stub.h
+++ b/arch/frv/include/asm/gdb-stub.h
@@ -12,6 +12,7 @@
12#ifndef __ASM_GDB_STUB_H 12#ifndef __ASM_GDB_STUB_H
13#define __ASM_GDB_STUB_H 13#define __ASM_GDB_STUB_H
14 14
15#undef GDBSTUB_DEBUG_IO
15#undef GDBSTUB_DEBUG_PROTOCOL 16#undef GDBSTUB_DEBUG_PROTOCOL
16 17
17#include <asm/ptrace.h> 18#include <asm/ptrace.h>
@@ -108,6 +109,12 @@ extern void gdbstub_printk(const char *fmt, ...);
108extern void debug_to_serial(const char *p, int n); 109extern void debug_to_serial(const char *p, int n);
109extern void console_set_baud(unsigned baud); 110extern void console_set_baud(unsigned baud);
110 111
112#ifdef GDBSTUB_DEBUG_IO
113#define gdbstub_io(FMT,...) gdbstub_printk(FMT, ##__VA_ARGS__)
114#else
115#define gdbstub_io(FMT,...) ({ 0; })
116#endif
117
111#ifdef GDBSTUB_DEBUG_PROTOCOL 118#ifdef GDBSTUB_DEBUG_PROTOCOL
112#define gdbstub_proto(FMT,...) gdbstub_printk(FMT,##__VA_ARGS__) 119#define gdbstub_proto(FMT,...) gdbstub_printk(FMT,##__VA_ARGS__)
113#else 120#else
diff --git a/arch/frv/kernel/gdb-io.c b/arch/frv/kernel/gdb-io.c
index c997bccb9221..2ca641d199f8 100644
--- a/arch/frv/kernel/gdb-io.c
+++ b/arch/frv/kernel/gdb-io.c
@@ -171,11 +171,11 @@ int gdbstub_rx_char(unsigned char *_ch, int nonblock)
171 return -EINTR; 171 return -EINTR;
172 } 172 }
173 else if (st & (UART_LSR_FE|UART_LSR_OE|UART_LSR_PE)) { 173 else if (st & (UART_LSR_FE|UART_LSR_OE|UART_LSR_PE)) {
174 gdbstub_proto("### GDB Rx Error (st=%02x) ###\n",st); 174 gdbstub_io("### GDB Rx Error (st=%02x) ###\n",st);
175 return -EIO; 175 return -EIO;
176 } 176 }
177 else { 177 else {
178 gdbstub_proto("### GDB Rx %02x (st=%02x) ###\n",ch,st); 178 gdbstub_io("### GDB Rx %02x (st=%02x) ###\n",ch,st);
179 *_ch = ch & 0x7f; 179 *_ch = ch & 0x7f;
180 return 0; 180 return 0;
181 } 181 }
diff --git a/arch/frv/kernel/gdb-stub.c b/arch/frv/kernel/gdb-stub.c
index 7ca8a6b19ac9..84d103c33c9c 100644
--- a/arch/frv/kernel/gdb-stub.c
+++ b/arch/frv/kernel/gdb-stub.c
@@ -1344,6 +1344,44 @@ void gdbstub_get_mmu_state(void)
1344 1344
1345} /* end gdbstub_get_mmu_state() */ 1345} /* end gdbstub_get_mmu_state() */
1346 1346
1347/*
1348 * handle general query commands of the form 'qXXXXX'
1349 */
1350static void gdbstub_handle_query(void)
1351{
1352 if (strcmp(input_buffer, "qAttached") == 0) {
1353 /* return current thread ID */
1354 sprintf(output_buffer, "1");
1355 return;
1356 }
1357
1358 if (strcmp(input_buffer, "qC") == 0) {
1359 /* return current thread ID */
1360 sprintf(output_buffer, "QC 0");
1361 return;
1362 }
1363
1364 if (strcmp(input_buffer, "qOffsets") == 0) {
1365 /* return relocation offset of text and data segments */
1366 sprintf(output_buffer, "Text=0;Data=0;Bss=0");
1367 return;
1368 }
1369
1370 if (strcmp(input_buffer, "qSymbol::") == 0) {
1371 sprintf(output_buffer, "OK");
1372 return;
1373 }
1374
1375 if (strcmp(input_buffer, "qSupported") == 0) {
1376 /* query of supported features */
1377 sprintf(output_buffer, "PacketSize=%u;ReverseContinue-;ReverseStep-",
1378 sizeof(input_buffer));
1379 return;
1380 }
1381
1382 gdbstub_strcpy(output_buffer,"E01");
1383}
1384
1347/*****************************************************************************/ 1385/*****************************************************************************/
1348/* 1386/*
1349 * handle event interception and GDB remote protocol processing 1387 * handle event interception and GDB remote protocol processing
@@ -1840,6 +1878,10 @@ void gdbstub(int sigval)
1840 case 'k' : 1878 case 'k' :
1841 goto done; /* just continue */ 1879 goto done; /* just continue */
1842 1880
1881 /* detach */
1882 case 'D':
1883 gdbstub_strcpy(output_buffer, "OK");
1884 break;
1843 1885
1844 /* reset the whole machine (FIXME: system dependent) */ 1886 /* reset the whole machine (FIXME: system dependent) */
1845 case 'r': 1887 case 'r':
@@ -1852,6 +1894,14 @@ void gdbstub(int sigval)
1852 __debug_status.dcr |= DCR_SE; 1894 __debug_status.dcr |= DCR_SE;
1853 goto done; 1895 goto done;
1854 1896
1897 /* extended command */
1898 case 'v':
1899 if (strcmp(input_buffer, "vCont?") == 0) {
1900 output_buffer[0] = 0;
1901 break;
1902 }
1903 goto unsupported_cmd;
1904
1855 /* set baud rate (bBB) */ 1905 /* set baud rate (bBB) */
1856 case 'b': 1906 case 'b':
1857 ptr = &input_buffer[1]; 1907 ptr = &input_buffer[1];
@@ -1923,8 +1973,19 @@ void gdbstub(int sigval)
1923 gdbstub_strcpy(output_buffer,"OK"); 1973 gdbstub_strcpy(output_buffer,"OK");
1924 break; 1974 break;
1925 1975
1976 /* Thread-setting packet */
1977 case 'H':
1978 gdbstub_strcpy(output_buffer, "OK");
1979 break;
1980
1981 case 'q':
1982 gdbstub_handle_query();
1983 break;
1984
1926 default: 1985 default:
1986 unsupported_cmd:
1927 gdbstub_proto("### GDB Unsupported Cmd '%s'\n",input_buffer); 1987 gdbstub_proto("### GDB Unsupported Cmd '%s'\n",input_buffer);
1988 gdbstub_strcpy(output_buffer,"E01");
1928 break; 1989 break;
1929 } 1990 }
1930 1991
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index e41222d6c2fd..f0cc1f84a72f 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -1,157 +1 @@
1/* MN10300 Atomic counter operations #include <asm-generic/atomic.h>
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11#ifndef _ASM_ATOMIC_H
12#define _ASM_ATOMIC_H
13
14#ifdef CONFIG_SMP
15#error not SMP safe
16#endif
17
18/*
19 * Atomic operations that C can't guarantee us. Useful for
20 * resource counting etc..
21 */
22
23#define ATOMIC_INIT(i) { (i) }
24
25#ifdef __KERNEL__
26
27/**
28 * atomic_read - read atomic variable
29 * @v: pointer of type atomic_t
30 *
31 * Atomically reads the value of @v. Note that the guaranteed
32 * useful range of an atomic_t is only 24 bits.
33 */
34#define atomic_read(v) (*(volatile int *)&(v)->counter)
35
36/**
37 * atomic_set - set atomic variable
38 * @v: pointer of type atomic_t
39 * @i: required value
40 *
41 * Atomically sets the value of @v to @i. Note that the guaranteed
42 * useful range of an atomic_t is only 24 bits.
43 */
44#define atomic_set(v, i) (((v)->counter) = (i))
45
46#include <asm/system.h>
47
48/**
49 * atomic_add_return - add integer to atomic variable
50 * @i: integer value to add
51 * @v: pointer of type atomic_t
52 *
53 * Atomically adds @i to @v and returns the result
54 * Note that the guaranteed useful range of an atomic_t is only 24 bits.
55 */
56static inline int atomic_add_return(int i, atomic_t *v)
57{
58 unsigned long flags;
59 int temp;
60
61 local_irq_save(flags);
62 temp = v->counter;
63 temp += i;
64 v->counter = temp;
65 local_irq_restore(flags);
66
67 return temp;
68}
69
70/**
71 * atomic_sub_return - subtract integer from atomic variable
72 * @i: integer value to subtract
73 * @v: pointer of type atomic_t
74 *
75 * Atomically subtracts @i from @v and returns the result
76 * Note that the guaranteed useful range of an atomic_t is only 24 bits.
77 */
78static inline int atomic_sub_return(int i, atomic_t *v)
79{
80 unsigned long flags;
81 int temp;
82
83 local_irq_save(flags);
84 temp = v->counter;
85 temp -= i;
86 v->counter = temp;
87 local_irq_restore(flags);
88
89 return temp;
90}
91
92static inline int atomic_add_negative(int i, atomic_t *v)
93{
94 return atomic_add_return(i, v) < 0;
95}
96
97static inline void atomic_add(int i, atomic_t *v)
98{
99 atomic_add_return(i, v);
100}
101
102static inline void atomic_sub(int i, atomic_t *v)
103{
104 atomic_sub_return(i, v);
105}
106
107static inline void atomic_inc(atomic_t *v)
108{
109 atomic_add_return(1, v);
110}
111
112static inline void atomic_dec(atomic_t *v)
113{
114 atomic_sub_return(1, v);
115}
116
117#define atomic_dec_return(v) atomic_sub_return(1, (v))
118#define atomic_inc_return(v) atomic_add_return(1, (v))
119
120#define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0)
121#define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0)
122#define atomic_inc_and_test(v) (atomic_add_return(1, (v)) == 0)
123
124#define atomic_add_unless(v, a, u) \
125({ \
126 int c, old; \
127 c = atomic_read(v); \
128 while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
129 c = old; \
130 c != (u); \
131})
132
133#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
134
135static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
136{
137 unsigned long flags;
138
139 mask = ~mask;
140 local_irq_save(flags);
141 *addr &= mask;
142 local_irq_restore(flags);
143}
144
145#define atomic_xchg(ptr, v) (xchg(&(ptr)->counter, (v)))
146#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
147
148/* Atomic operations are already serializing on MN10300??? */
149#define smp_mb__before_atomic_dec() barrier()
150#define smp_mb__after_atomic_dec() barrier()
151#define smp_mb__before_atomic_inc() barrier()
152#define smp_mb__after_atomic_inc() barrier()
153
154#include <asm-generic/atomic-long.h>
155
156#endif /* __KERNEL__ */
157#endif /* _ASM_ATOMIC_H */
diff --git a/arch/mn10300/include/asm/cache.h b/arch/mn10300/include/asm/cache.h
index e03cfa2e997e..6e2fe28dde4e 100644
--- a/arch/mn10300/include/asm/cache.h
+++ b/arch/mn10300/include/asm/cache.h
@@ -21,6 +21,8 @@
21#define L1_CACHE_DISPARITY L1_CACHE_NENTRIES * L1_CACHE_BYTES 21#define L1_CACHE_DISPARITY L1_CACHE_NENTRIES * L1_CACHE_BYTES
22#endif 22#endif
23 23
24#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES
25
24/* data cache purge registers 26/* data cache purge registers
25 * - read from the register to unconditionally purge that cache line 27 * - read from the register to unconditionally purge that cache line
26 * - write address & 0xffffff00 to conditionally purge that cache line 28 * - write address & 0xffffff00 to conditionally purge that cache line
diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
index 3a7a67a0d006..8b8fab91ad1e 100644
--- a/arch/powerpc/include/asm/sfp-machine.h
+++ b/arch/powerpc/include/asm/sfp-machine.h
@@ -353,12 +353,6 @@
353#define abort() \ 353#define abort() \
354 return 0 354 return 0
355 355
356#ifdef __BIG_ENDIAN
357#define __BYTE_ORDER __BIG_ENDIAN
358#else
359#define __BYTE_ORDER __LITTLE_ENDIAN
360#endif
361
362/* Exception flags. */ 356/* Exception flags. */
363#define EFLAG_INVALID (1 << (31 - 2)) 357#define EFLAG_INVALID (1 << (31 - 2))
364#define EFLAG_OVERFLOW (1 << (31 - 3)) 358#define EFLAG_OVERFLOW (1 << (31 - 3))
diff --git a/arch/s390/include/asm/sfp-util.h b/arch/s390/include/asm/sfp-util.h
index 0addc6466d95..7d43fee17e32 100644
--- a/arch/s390/include/asm/sfp-util.h
+++ b/arch/s390/include/asm/sfp-util.h
@@ -73,5 +73,3 @@ extern unsigned long __udiv_qrnnd (unsigned int *, unsigned int,
73#define UDIV_NEEDS_NORMALIZATION 0 73#define UDIV_NEEDS_NORMALIZATION 0
74 74
75#define abort() return 0 75#define abort() return 0
76
77#define __BYTE_ORDER __BIG_ENDIAN
diff --git a/arch/sh/math-emu/sfp-util.h b/arch/sh/math-emu/sfp-util.h
index 8ae1bd310ad0..e8526021892f 100644
--- a/arch/sh/math-emu/sfp-util.h
+++ b/arch/sh/math-emu/sfp-util.h
@@ -66,7 +66,3 @@
66 } while (0) 66 } while (0)
67 67
68#define abort() return 0 68#define abort() return 0
69
70#define __BYTE_ORDER __LITTLE_ENDIAN
71
72
diff --git a/arch/sparc/math-emu/sfp-util_32.h b/arch/sparc/math-emu/sfp-util_32.h
index d1b2aff3c259..0ea35afbb914 100644
--- a/arch/sparc/math-emu/sfp-util_32.h
+++ b/arch/sparc/math-emu/sfp-util_32.h
@@ -107,9 +107,3 @@
107 107
108#define abort() \ 108#define abort() \
109 return 0 109 return 0
110
111#ifdef __BIG_ENDIAN
112#define __BYTE_ORDER __BIG_ENDIAN
113#else
114#define __BYTE_ORDER __LITTLE_ENDIAN
115#endif
diff --git a/arch/sparc/math-emu/sfp-util_64.h b/arch/sparc/math-emu/sfp-util_64.h
index 425d3cf01af4..d17c9bc72181 100644
--- a/arch/sparc/math-emu/sfp-util_64.h
+++ b/arch/sparc/math-emu/sfp-util_64.h
@@ -112,9 +112,3 @@
112 112
113#define abort() \ 113#define abort() \
114 return 0 114 return 0
115
116#ifdef __BIG_ENDIAN
117#define __BYTE_ORDER __BIG_ENDIAN
118#else
119#define __BYTE_ORDER __LITTLE_ENDIAN
120#endif
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 89bbf4e4d05d..7b1aaa20c7b5 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -195,11 +195,11 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
195 195
196 196
197 197
198#if BYTE_ORDER == LITTLE_ENDIAN 198#if __BYTE_ORDER == __LITTLE_ENDIAN
199#define le16_to_cpu(val) (val) 199#define le16_to_cpu(val) (val)
200#define le32_to_cpu(val) (val) 200#define le32_to_cpu(val) (val)
201#endif 201#endif
202#if BYTE_ORDER == BIG_ENDIAN 202#if __BYTE_ORDER == __BIG_ENDIAN
203#define le16_to_cpu(val) bswap_16(val) 203#define le16_to_cpu(val) bswap_16(val)
204#define le32_to_cpu(val) bswap_32(val) 204#define le32_to_cpu(val) bswap_32(val)
205#endif 205#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f9324851eba0..b49d8ca228f6 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,8 @@
236 236
237#define MSR_IA32_MISC_ENABLE 0x000001a0 237#define MSR_IA32_MISC_ENABLE 0x000001a0
238 238
239#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
240
239/* MISC_ENABLE bits: architectural */ 241/* MISC_ENABLE bits: architectural */
240#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 242#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
241#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 243#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/xtensa/include/asm/cache.h b/arch/xtensa/include/asm/cache.h
index f04c9891142f..ed8cd3cbd499 100644
--- a/arch/xtensa/include/asm/cache.h
+++ b/arch/xtensa/include/asm/cache.h
@@ -29,5 +29,6 @@
29# define CACHE_WAY_SIZE ICACHE_WAY_SIZE 29# define CACHE_WAY_SIZE ICACHE_WAY_SIZE
30#endif 30#endif
31 31
32#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES
32 33
33#endif /* _XTENSA_CACHE_H */ 34#endif /* _XTENSA_CACHE_H */
diff --git a/arch/xtensa/include/asm/hardirq.h b/arch/xtensa/include/asm/hardirq.h
index 87cb19d1b10c..26664cef8f11 100644
--- a/arch/xtensa/include/asm/hardirq.h
+++ b/arch/xtensa/include/asm/hardirq.h
@@ -11,18 +11,9 @@
11#ifndef _XTENSA_HARDIRQ_H 11#ifndef _XTENSA_HARDIRQ_H
12#define _XTENSA_HARDIRQ_H 12#define _XTENSA_HARDIRQ_H
13 13
14#include <linux/cache.h>
15#include <asm/irq.h>
16
17/* headers.S is sensitive to the offsets of these fields */
18typedef struct {
19 unsigned int __softirq_pending;
20 unsigned int __syscall_count;
21 struct task_struct * __ksoftirqd_task; /* waitqueue is too large */
22 unsigned int __nmi_count; /* arch dependent */
23} ____cacheline_aligned irq_cpustat_t;
24
25void ack_bad_irq(unsigned int irq); 14void ack_bad_irq(unsigned int irq);
26#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ 15#define ack_bad_irq ack_bad_irq
16
17#include <asm-generic/hardirq.h>
27 18
28#endif /* _XTENSA_HARDIRQ_H */ 19#endif /* _XTENSA_HARDIRQ_H */
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 8cd38484e130..c64a5d387de5 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -27,15 +27,6 @@ static unsigned int cached_irq_mask;
27atomic_t irq_err_count; 27atomic_t irq_err_count;
28 28
29/* 29/*
30 * 'what should we do if we get a hw irq event on an illegal vector'.
31 * each architecture has to answer this themselves.
32 */
33void ack_bad_irq(unsigned int irq)
34{
35 printk("unexpected IRQ trap at vector %02x\n", irq);
36}
37
38/*
39 * do_IRQ handles all normal device IRQ's (the special 30 * do_IRQ handles all normal device IRQ's (the special
40 * SMP cross-CPU interrupts have their own specific 31 * SMP cross-CPU interrupts have their own specific
41 * handlers). 32 * handlers).
diff --git a/arch/xtensa/kernel/vectors.S b/arch/xtensa/kernel/vectors.S
index 74a7518faf16..70066e3582d0 100644
--- a/arch/xtensa/kernel/vectors.S
+++ b/arch/xtensa/kernel/vectors.S
@@ -44,14 +44,12 @@
44 44
45#include <linux/linkage.h> 45#include <linux/linkage.h>
46#include <asm/ptrace.h> 46#include <asm/ptrace.h>
47#include <asm/ptrace.h>
48#include <asm/current.h> 47#include <asm/current.h>
49#include <asm/asm-offsets.h> 48#include <asm/asm-offsets.h>
50#include <asm/pgtable.h> 49#include <asm/pgtable.h>
51#include <asm/processor.h> 50#include <asm/processor.h>
52#include <asm/page.h> 51#include <asm/page.h>
53#include <asm/thread_info.h> 52#include <asm/thread_info.h>
54#include <asm/processor.h>
55 53
56#define WINDOW_VECTORS_SIZE 0x180 54#define WINDOW_VECTORS_SIZE 0x180
57 55
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 9042a8579668..c1d23cd71652 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -401,11 +401,6 @@ static void acpi_print_osc_error(acpi_handle handle,
401 printk("\n"); 401 printk("\n");
402} 402}
403 403
404static u8 hex_val(unsigned char c)
405{
406 return isdigit(c) ? c - '0' : toupper(c) - 'A' + 10;
407}
408
409static acpi_status acpi_str_to_uuid(char *str, u8 *uuid) 404static acpi_status acpi_str_to_uuid(char *str, u8 *uuid)
410{ 405{
411 int i; 406 int i;
@@ -422,8 +417,8 @@ static acpi_status acpi_str_to_uuid(char *str, u8 *uuid)
422 return AE_BAD_PARAMETER; 417 return AE_BAD_PARAMETER;
423 } 418 }
424 for (i = 0; i < 16; i++) { 419 for (i = 0; i < 16; i++) {
425 uuid[i] = hex_val(str[opc_map_to_uuid[i]]) << 4; 420 uuid[i] = hex_to_bin(str[opc_map_to_uuid[i]]) << 4;
426 uuid[i] |= hex_val(str[opc_map_to_uuid[i] + 1]); 421 uuid[i] |= hex_to_bin(str[opc_map_to_uuid[i] + 1]);
427 } 422 }
428 return AE_OK; 423 return AE_OK;
429} 424}
diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c
index 3fecfb446d90..5ad3bad2b0a5 100644
--- a/drivers/auxdisplay/cfag12864bfb.c
+++ b/drivers/auxdisplay/cfag12864bfb.c
@@ -37,7 +37,7 @@
37 37
38#define CFAG12864BFB_NAME "cfag12864bfb" 38#define CFAG12864BFB_NAME "cfag12864bfb"
39 39
40static struct fb_fix_screeninfo cfag12864bfb_fix __initdata = { 40static struct fb_fix_screeninfo cfag12864bfb_fix __devinitdata = {
41 .id = "cfag12864b", 41 .id = "cfag12864b",
42 .type = FB_TYPE_PACKED_PIXELS, 42 .type = FB_TYPE_PACKED_PIXELS,
43 .visual = FB_VISUAL_MONO10, 43 .visual = FB_VISUAL_MONO10,
@@ -48,7 +48,7 @@ static struct fb_fix_screeninfo cfag12864bfb_fix __initdata = {
48 .accel = FB_ACCEL_NONE, 48 .accel = FB_ACCEL_NONE,
49}; 49};
50 50
51static struct fb_var_screeninfo cfag12864bfb_var __initdata = { 51static struct fb_var_screeninfo cfag12864bfb_var __devinitdata = {
52 .xres = CFAG12864B_WIDTH, 52 .xres = CFAG12864B_WIDTH,
53 .yres = CFAG12864B_HEIGHT, 53 .yres = CFAG12864B_HEIGHT,
54 .xres_virtual = CFAG12864B_WIDTH, 54 .xres_virtual = CFAG12864B_WIDTH,
@@ -114,7 +114,7 @@ none:
114 return ret; 114 return ret;
115} 115}
116 116
117static int cfag12864bfb_remove(struct platform_device *device) 117static int __devexit cfag12864bfb_remove(struct platform_device *device)
118{ 118{
119 struct fb_info *info = platform_get_drvdata(device); 119 struct fb_info *info = platform_get_drvdata(device);
120 120
@@ -128,7 +128,7 @@ static int cfag12864bfb_remove(struct platform_device *device)
128 128
129static struct platform_driver cfag12864bfb_driver = { 129static struct platform_driver cfag12864bfb_driver = {
130 .probe = cfag12864bfb_probe, 130 .probe = cfag12864bfb_probe,
131 .remove = cfag12864bfb_remove, 131 .remove = __devexit_p(cfag12864bfb_remove),
132 .driver = { 132 .driver = {
133 .name = CFAG12864BFB_NAME, 133 .name = CFAG12864BFB_NAME,
134 }, 134 },
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 057979a19eea..2bdd8a94ec94 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -9,6 +9,7 @@
9#include <linux/memory.h> 9#include <linux/memory.h>
10#include <linux/node.h> 10#include <linux/node.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/compaction.h>
12#include <linux/cpumask.h> 13#include <linux/cpumask.h>
13#include <linux/topology.h> 14#include <linux/topology.h>
14#include <linux/nodemask.h> 15#include <linux/nodemask.h>
@@ -246,6 +247,8 @@ int register_node(struct node *node, int num, struct node *parent)
246 scan_unevictable_register_node(node); 247 scan_unevictable_register_node(node);
247 248
248 hugetlb_register_node(node); 249 hugetlb_register_node(node);
250
251 compaction_register_node(node);
249 } 252 }
250 return error; 253 return error;
251} 254}
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 712d9f271aa6..e0249722d25f 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -49,8 +49,9 @@
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50#include <linux/sysrq.h> 50#include <linux/sysrq.h>
51#include <linux/timer.h> 51#include <linux/timer.h>
52#include <linux/time.h>
52 53
53#define VERSION_STR "0.9.0" 54#define VERSION_STR "0.9.1"
54 55
55#define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */ 56#define DEFAULT_IOFENCE_MARGIN 60 /* Default fudge factor, in seconds */
56#define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */ 57#define DEFAULT_IOFENCE_TICK 180 /* Default timer timeout, in seconds */
@@ -119,10 +120,8 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
119#if defined(CONFIG_S390) 120#if defined(CONFIG_S390)
120# define HAVE_MONOTONIC 121# define HAVE_MONOTONIC
121# define TIMER_FREQ 1000000000ULL 122# define TIMER_FREQ 1000000000ULL
122#elif defined(CONFIG_IA64)
123# define TIMER_FREQ ((unsigned long long)local_cpu_data->itc_freq)
124#else 123#else
125# define TIMER_FREQ (HZ*loops_per_jiffy) 124# define TIMER_FREQ 1000000000ULL
126#endif 125#endif
127 126
128#ifdef HAVE_MONOTONIC 127#ifdef HAVE_MONOTONIC
@@ -130,7 +129,9 @@ extern unsigned long long monotonic_clock(void);
130#else 129#else
131static inline unsigned long long monotonic_clock(void) 130static inline unsigned long long monotonic_clock(void)
132{ 131{
133 return get_cycles(); 132 struct timespec ts;
133 getrawmonotonic(&ts);
134 return timespec_to_ns(&ts);
134} 135}
135#endif /* HAVE_MONOTONIC */ 136#endif /* HAVE_MONOTONIC */
136 137
@@ -168,6 +169,13 @@ static void hangcheck_fire(unsigned long data)
168 printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n"); 169 printk(KERN_CRIT "Hangcheck: hangcheck value past margin!\n");
169 } 170 }
170 } 171 }
172#if 0
173 /*
174 * Enable to investigate delays in detail
175 */
176 printk("Hangcheck: called %Ld ns since last time (%Ld ns overshoot)\n",
177 tsc_diff, tsc_diff - hangcheck_tick*TIMER_FREQ);
178#endif
171 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ)); 179 mod_timer(&hangcheck_ticktock, jiffies + (hangcheck_tick*HZ));
172 hangcheck_tsc = monotonic_clock(); 180 hangcheck_tsc = monotonic_clock();
173} 181}
@@ -180,7 +188,7 @@ static int __init hangcheck_init(void)
180#if defined (HAVE_MONOTONIC) 188#if defined (HAVE_MONOTONIC)
181 printk("Hangcheck: Using monotonic_clock().\n"); 189 printk("Hangcheck: Using monotonic_clock().\n");
182#else 190#else
183 printk("Hangcheck: Using get_cycles().\n"); 191 printk("Hangcheck: Using getrawmonotonic().\n");
184#endif /* HAVE_MONOTONIC */ 192#endif /* HAVE_MONOTONIC */
185 hangcheck_tsc_margin = 193 hangcheck_tsc_margin =
186 (unsigned long long)(hangcheck_margin + hangcheck_tick); 194 (unsigned long long)(hangcheck_margin + hangcheck_tick);
diff --git a/drivers/char/hvsi.c b/drivers/char/hvsi.c
index 793b236c9266..d4b14ff1c4c1 100644
--- a/drivers/char/hvsi.c
+++ b/drivers/char/hvsi.c
@@ -194,10 +194,8 @@ static inline void print_state(struct hvsi_struct *hp)
194 "HVSI_WAIT_FOR_MCTRL_RESPONSE", 194 "HVSI_WAIT_FOR_MCTRL_RESPONSE",
195 "HVSI_FSP_DIED", 195 "HVSI_FSP_DIED",
196 }; 196 };
197 const char *name = state_names[hp->state]; 197 const char *name = (hp->state < ARRAY_SIZE(state_names))
198 198 ? state_names[hp->state] : "UNKNOWN";
199 if (hp->state > ARRAY_SIZE(state_names))
200 name = "UNKNOWN";
201 199
202 pr_debug("hvsi%i: state = %s\n", hp->index, name); 200 pr_debug("hvsi%i: state = %s\n", hp->index, name);
203#endif /* DEBUG */ 201#endif /* DEBUG */
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 92ab03d28294..cd650ca8c679 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -144,6 +144,7 @@ static int misc_open(struct inode * inode, struct file * file)
144 old_fops = file->f_op; 144 old_fops = file->f_op;
145 file->f_op = new_fops; 145 file->f_op = new_fops;
146 if (file->f_op->open) { 146 if (file->f_op->open) {
147 file->private_data = c;
147 err=file->f_op->open(inode,file); 148 err=file->f_op->open(inode,file);
148 if (err) { 149 if (err) {
149 fops_put(file->f_op); 150 fops_put(file->f_op);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b81ad9c731ae..52ff8aa63f84 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -21,9 +21,12 @@
21#include <linux/math64.h> 21#include <linux/math64.h>
22 22
23#define BUCKETS 12 23#define BUCKETS 12
24#define INTERVALS 8
24#define RESOLUTION 1024 25#define RESOLUTION 1024
25#define DECAY 4 26#define DECAY 8
26#define MAX_INTERESTING 50000 27#define MAX_INTERESTING 50000
28#define STDDEV_THRESH 400
29
27 30
28/* 31/*
29 * Concepts and ideas behind the menu governor 32 * Concepts and ideas behind the menu governor
@@ -64,6 +67,16 @@
64 * indexed based on the magnitude of the expected duration as well as the 67 * indexed based on the magnitude of the expected duration as well as the
65 * "is IO outstanding" property. 68 * "is IO outstanding" property.
66 * 69 *
70 * Repeatable-interval-detector
71 * ----------------------------
72 * There are some cases where "next timer" is a completely unusable predictor:
73 * Those cases where the interval is fixed, for example due to hardware
74 * interrupt mitigation, but also due to fixed transfer rate devices such as
75 * mice.
76 * For this, we use a different predictor: We track the duration of the last 8
 77 * intervals and if the standard deviation of these 8 intervals is below a
78 * threshold value, we use the average of these intervals as prediction.
79 *
67 * Limiting Performance Impact 80 * Limiting Performance Impact
68 * --------------------------- 81 * ---------------------------
69 * C states, especially those with large exit latencies, can have a real 82 * C states, especially those with large exit latencies, can have a real
@@ -104,6 +117,8 @@ struct menu_device {
104 unsigned int exit_us; 117 unsigned int exit_us;
105 unsigned int bucket; 118 unsigned int bucket;
106 u64 correction_factor[BUCKETS]; 119 u64 correction_factor[BUCKETS];
120 u32 intervals[INTERVALS];
121 int interval_ptr;
107}; 122};
108 123
109 124
@@ -175,6 +190,42 @@ static u64 div_round64(u64 dividend, u32 divisor)
175 return div_u64(dividend + (divisor / 2), divisor); 190 return div_u64(dividend + (divisor / 2), divisor);
176} 191}
177 192
193/*
194 * Try detecting repeating patterns by keeping track of the last 8
195 * intervals, and checking if the standard deviation of that set
196 * of points is below a threshold. If it is... then use the
197 * average of these 8 points as the estimated value.
198 */
199static void detect_repeating_patterns(struct menu_device *data)
200{
201 int i;
202 uint64_t avg = 0;
203 uint64_t stddev = 0; /* contains the square of the std deviation */
204
205 /* first calculate average and standard deviation of the past */
206 for (i = 0; i < INTERVALS; i++)
207 avg += data->intervals[i];
208 avg = avg / INTERVALS;
209
210 /* if the avg is beyond the known next tick, it's worthless */
211 if (avg > data->expected_us)
212 return;
213
214 for (i = 0; i < INTERVALS; i++)
215 stddev += (data->intervals[i] - avg) *
216 (data->intervals[i] - avg);
217
218 stddev = stddev / INTERVALS;
219
220 /*
221 * now.. if stddev is small.. then assume we have a
222 * repeating pattern and predict we keep doing this.
223 */
224
225 if (avg && stddev < STDDEV_THRESH)
226 data->predicted_us = avg;
227}
228
178/** 229/**
179 * menu_select - selects the next idle state to enter 230 * menu_select - selects the next idle state to enter
180 * @dev: the CPU 231 * @dev: the CPU
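[editor's note: for illustration, a minimal standalone sketch of the same detector
logic in userspace C — the sample intervals are hypothetical, and this sketch omits
the in-kernel check above that discards the average when it exceeds the expected
next-timer duration:

	#include <stdio.h>
	#include <stdint.h>

	#define INTERVALS 8
	#define STDDEV_THRESH 400

	/* Return the average of the samples if they look periodic, else 0. */
	static uint64_t predict_interval(const uint32_t intervals[INTERVALS])
	{
		uint64_t avg = 0, variance = 0;
		int i;

		for (i = 0; i < INTERVALS; i++)
			avg += intervals[i];
		avg /= INTERVALS;

		/* "variance" mirrors the kernel's stddev variable, which
		 * holds the square of the standard deviation */
		for (i = 0; i < INTERVALS; i++) {
			int64_t d = (int64_t)intervals[i] - (int64_t)avg;
			variance += (uint64_t)(d * d);
		}
		variance /= INTERVALS;

		return (avg && variance < STDDEV_THRESH) ? avg : 0;
	}

	int main(void)
	{
		/* hypothetical idle intervals in microseconds, e.g. a
		 * device interrupting at a nearly fixed rate */
		uint32_t samples[INTERVALS] = {
			8000, 8010, 7995, 8002, 8008, 7990, 8005, 7998
		};

		printf("predicted interval: %llu us\n",
		       (unsigned long long)predict_interval(samples));
		return 0;
	}

With this sample data the variance works out to 39, well under the threshold of
400, so the average (8001 us) would be used as the prediction.]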
@@ -218,6 +269,8 @@ static int menu_select(struct cpuidle_device *dev)
218 data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket], 269 data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
219 RESOLUTION * DECAY); 270 RESOLUTION * DECAY);
220 271
272 detect_repeating_patterns(data);
273
221 /* 274 /*
222 * We want to default to C1 (hlt), not to busy polling 275 * We want to default to C1 (hlt), not to busy polling
223 * unless the timer is happening really really soon. 276 * unless the timer is happening really really soon.
@@ -310,6 +363,11 @@ static void menu_update(struct cpuidle_device *dev)
310 new_factor = 1; 363 new_factor = 1;
311 364
312 data->correction_factor[data->bucket] = new_factor; 365 data->correction_factor[data->bucket] = new_factor;
366
367 /* update the repeating-pattern data */
368 data->intervals[data->interval_ptr++] = last_idle_us;
369 if (data->interval_ptr >= INTERVALS)
370 data->interval_ptr = 0;
313} 371}
314 372
315/** 373/**
diff --git a/drivers/dma/timb_dma.c b/drivers/dma/timb_dma.c
index 0172fa3c7a2b..a1bf77c1993f 100644
--- a/drivers/dma/timb_dma.c
+++ b/drivers/dma/timb_dma.c
@@ -188,7 +188,7 @@ static void __td_unmap_descs(struct timb_dma_desc *td_desc, bool single)
188static int td_fill_desc(struct timb_dma_chan *td_chan, u8 *dma_desc, 188static int td_fill_desc(struct timb_dma_chan *td_chan, u8 *dma_desc,
189 struct scatterlist *sg, bool last) 189 struct scatterlist *sg, bool last)
190{ 190{
191 if (sg_dma_len(sg) > USHORT_MAX) { 191 if (sg_dma_len(sg) > USHRT_MAX) {
192 dev_err(chan2dev(&td_chan->chan), "Too big sg element\n"); 192 dev_err(chan2dev(&td_chan->chan), "Too big sg element\n");
193 return -EINVAL; 193 return -EINVAL;
194 } 194 }
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 9be8e1754a0b..6a9ac754ca5d 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -802,6 +802,15 @@ config SENSORS_ADS7828
802 This driver can also be built as a module. If so, the module 802 This driver can also be built as a module. If so, the module
803 will be called ads7828. 803 will be called ads7828.
804 804
805config SENSORS_ADS7871
806 tristate "Texas Instruments ADS7871 A/D converter"
807 depends on SPI
808 help
809 If you say yes here you get support for TI ADS7871 & ADS7870
810
811 This driver can also be built as a module. If so, the module
812 will be called ads7871.
813
805config SENSORS_AMC6821 814config SENSORS_AMC6821
806 tristate "Texas Instruments AMC6821" 815 tristate "Texas Instruments AMC6821"
807 depends on I2C && EXPERIMENTAL 816 depends on I2C && EXPERIMENTAL
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 4aa1a3d112ad..86920fb34118 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_SENSORS_ADM1029) += adm1029.o
29obj-$(CONFIG_SENSORS_ADM1031) += adm1031.o 29obj-$(CONFIG_SENSORS_ADM1031) += adm1031.o
30obj-$(CONFIG_SENSORS_ADM9240) += adm9240.o 30obj-$(CONFIG_SENSORS_ADM9240) += adm9240.o
31obj-$(CONFIG_SENSORS_ADS7828) += ads7828.o 31obj-$(CONFIG_SENSORS_ADS7828) += ads7828.o
32obj-$(CONFIG_SENSORS_ADS7871) += ads7871.o
32obj-$(CONFIG_SENSORS_ADT7411) += adt7411.o 33obj-$(CONFIG_SENSORS_ADT7411) += adt7411.o
33obj-$(CONFIG_SENSORS_ADT7462) += adt7462.o 34obj-$(CONFIG_SENSORS_ADT7462) += adt7462.o
34obj-$(CONFIG_SENSORS_ADT7470) += adt7470.o 35obj-$(CONFIG_SENSORS_ADT7470) += adt7470.o
diff --git a/drivers/hwmon/ads7871.c b/drivers/hwmon/ads7871.c
new file mode 100644
index 000000000000..b300a2048af1
--- /dev/null
+++ b/drivers/hwmon/ads7871.c
@@ -0,0 +1,253 @@
1/*
2 * ads7871 - driver for TI ADS7871 A/D converter
3 *
4 * Copyright (c) 2010 Paul Thomas <pthomas8589@gmail.com>
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 or
 13 * later as published by the Free Software Foundation.
14 *
15 * You need to have something like this in struct spi_board_info
16 * {
17 * .modalias = "ads7871",
18 * .max_speed_hz = 2*1000*1000,
19 * .chip_select = 0,
20 * .bus_num = 1,
21 * },
22 */
23
24/*From figure 18 in the datasheet*/
25/*Register addresses*/
26#define REG_LS_BYTE 0 /*A/D Output Data, LS Byte*/
27#define REG_MS_BYTE 1 /*A/D Output Data, MS Byte*/
28#define REG_PGA_VALID 2 /*PGA Valid Register*/
29#define REG_AD_CONTROL 3 /*A/D Control Register*/
30#define REG_GAIN_MUX 4 /*Gain/Mux Register*/
31#define REG_IO_STATE 5 /*Digital I/O State Register*/
32#define REG_IO_CONTROL 6 /*Digital I/O Control Register*/
33#define REG_OSC_CONTROL 7 /*Rev/Oscillator Control Register*/
34#define REG_SER_CONTROL 24 /*Serial Interface Control Register*/
35#define REG_ID 31 /*ID Register*/
36
37/*From figure 17 in the datasheet
38* These bits get ORed with the address to form
39* the instruction byte */
40/*Instruction Bit masks*/
41#define INST_MODE_bm (1<<7)
42#define INST_READ_bm (1<<6)
43#define INST_16BIT_bm (1<<5)
44
45/*From figure 18 in the datasheet*/
 46/*bit masks for Gain/Mux Register*/
47#define MUX_CNV_bv 7
48#define MUX_CNV_bm (1<<MUX_CNV_bv)
49#define MUX_M3_bm (1<<3) /*M3 selects single ended*/
50#define MUX_G_bv 4 /*allows for reg = (gain << MUX_G_bv) | ...*/
51
52/*From figure 18 in the datasheet*/
53/*bit masks for Rev/Oscillator Control Register*/
54#define OSC_OSCR_bm (1<<5)
55#define OSC_OSCE_bm (1<<4)
56#define OSC_REFE_bm (1<<3)
57#define OSC_BUFE_bm (1<<2)
58#define OSC_R2V_bm (1<<1)
59#define OSC_RBG_bm (1<<0)
60
61#include <linux/module.h>
62#include <linux/init.h>
63#include <linux/spi/spi.h>
64#include <linux/hwmon.h>
65#include <linux/hwmon-sysfs.h>
66#include <linux/err.h>
67#include <linux/mutex.h>
68#include <linux/delay.h>
69
70#define DEVICE_NAME "ads7871"
71
72struct ads7871_data {
73 struct device *hwmon_dev;
74 struct mutex update_lock;
75};
76
77static int ads7871_read_reg8(struct spi_device *spi, int reg)
78{
79 int ret;
80 reg = reg | INST_READ_bm;
81 ret = spi_w8r8(spi, reg);
82 return ret;
83}
84
85static int ads7871_read_reg16(struct spi_device *spi, int reg)
86{
87 int ret;
88 reg = reg | INST_READ_bm | INST_16BIT_bm;
89 ret = spi_w8r16(spi, reg);
90 return ret;
91}
92
93static int ads7871_write_reg8(struct spi_device *spi, int reg, u8 val)
94{
95 u8 tmp[2] = {reg, val};
96 return spi_write(spi, tmp, sizeof(tmp));
97}
98
99static ssize_t show_voltage(struct device *dev,
100 struct device_attribute *da, char *buf)
101{
102 struct spi_device *spi = to_spi_device(dev);
103 struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
104 int ret, val, i = 0;
105 uint8_t channel, mux_cnv;
106
107 channel = attr->index;
108 /*TODO: add support for conversions
109 *other than single ended with a gain of 1*/
110 /*MUX_M3_bm forces single ended*/
111 /*This is also where the gain of the PGA would be set*/
112 ads7871_write_reg8(spi, REG_GAIN_MUX,
113 (MUX_CNV_bm | MUX_M3_bm | channel));
114
115 ret = ads7871_read_reg8(spi, REG_GAIN_MUX);
116 mux_cnv = ((ret & MUX_CNV_bm)>>MUX_CNV_bv);
 117	/*on a 400MHz ARM9 platform the conversion
 118	 *is already complete by the time we do this test*/
119 while ((i < 2) && mux_cnv) {
120 i++;
121 ret = ads7871_read_reg8(spi, REG_GAIN_MUX);
122 mux_cnv = ((ret & MUX_CNV_bm)>>MUX_CNV_bv);
123 msleep_interruptible(1);
124 }
125
126 if (mux_cnv == 0) {
127 val = ads7871_read_reg16(spi, REG_LS_BYTE);
128 /*result in volts*10000 = (val/8192)*2.5*10000*/
129 val = ((val>>2) * 25000) / 8192;
130 return sprintf(buf, "%d\n", val);
 131	} else {
 132		return -EIO; /* conversion never completed */
 133	}
134}
135
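The scaling in show_voltage() above can be checked in isolation. Here is a hedged standalone rendering of the same integer arithmetic with a made-up register value; per the driver's comment, the reported number is volts multiplied by 10000.

#include <stdio.h>

int main(void)
{
	int raw = 0x4000;		/* made-up 16-bit A/D register value */
	int code = raw >> 2;		/* drop the two low bits: 4096 */

	/* volts * 10000 = (code / 8192) * 2.5 * 10000 */
	int val = (code * 25000) / 8192;

	printf("%d\n", val);		/* 12500, i.e. 1.2500 V */
	return 0;
}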
136static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, show_voltage, NULL, 0);
137static SENSOR_DEVICE_ATTR(in1_input, S_IRUGO, show_voltage, NULL, 1);
138static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, show_voltage, NULL, 2);
139static SENSOR_DEVICE_ATTR(in3_input, S_IRUGO, show_voltage, NULL, 3);
140static SENSOR_DEVICE_ATTR(in4_input, S_IRUGO, show_voltage, NULL, 4);
141static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, show_voltage, NULL, 5);
142static SENSOR_DEVICE_ATTR(in6_input, S_IRUGO, show_voltage, NULL, 6);
143static SENSOR_DEVICE_ATTR(in7_input, S_IRUGO, show_voltage, NULL, 7);
144
145static struct attribute *ads7871_attributes[] = {
146 &sensor_dev_attr_in0_input.dev_attr.attr,
147 &sensor_dev_attr_in1_input.dev_attr.attr,
148 &sensor_dev_attr_in2_input.dev_attr.attr,
149 &sensor_dev_attr_in3_input.dev_attr.attr,
150 &sensor_dev_attr_in4_input.dev_attr.attr,
151 &sensor_dev_attr_in5_input.dev_attr.attr,
152 &sensor_dev_attr_in6_input.dev_attr.attr,
153 &sensor_dev_attr_in7_input.dev_attr.attr,
154 NULL
155};
156
157static const struct attribute_group ads7871_group = {
158 .attrs = ads7871_attributes,
159};
160
161static int __devinit ads7871_probe(struct spi_device *spi)
162{
163 int status, ret, err = 0;
164 uint8_t val;
165 struct ads7871_data *pdata;
166
167 dev_dbg(&spi->dev, "probe\n");
168
169 pdata = kzalloc(sizeof(struct ads7871_data), GFP_KERNEL);
170 if (!pdata) {
171 err = -ENOMEM;
172 goto exit;
173 }
174
175 status = sysfs_create_group(&spi->dev.kobj, &ads7871_group);
 176	if (status < 0) {
 177		err = status;
 178		goto error_free;
 179	}
178
179 pdata->hwmon_dev = hwmon_device_register(&spi->dev);
180 if (IS_ERR(pdata->hwmon_dev)) {
181 err = PTR_ERR(pdata->hwmon_dev);
182 goto error_remove;
183 }
184
185 spi_set_drvdata(spi, pdata);
186
187 /* Configure the SPI bus */
188 spi->mode = (SPI_MODE_0);
189 spi->bits_per_word = 8;
190 spi_setup(spi);
191
192 ads7871_write_reg8(spi, REG_SER_CONTROL, 0);
193 ads7871_write_reg8(spi, REG_AD_CONTROL, 0);
194
195 val = (OSC_OSCR_bm | OSC_OSCE_bm | OSC_REFE_bm | OSC_BUFE_bm);
196 ads7871_write_reg8(spi, REG_OSC_CONTROL, val);
197 ret = ads7871_read_reg8(spi, REG_OSC_CONTROL);
198
199 dev_dbg(&spi->dev, "REG_OSC_CONTROL write:%x, read:%x\n", val, ret);
200 /*because there is no other error checking on an SPI bus
201 we need to make sure we really have a chip*/
202 if (val != ret) {
203 err = -ENODEV;
204 goto error_remove;
205 }
206
207 return 0;
208
209error_remove:
210 sysfs_remove_group(&spi->dev.kobj, &ads7871_group);
211error_free:
212 kfree(pdata);
213exit:
214 return err;
215}
216
217static int __devexit ads7871_remove(struct spi_device *spi)
218{
219 struct ads7871_data *pdata = spi_get_drvdata(spi);
220
221 hwmon_device_unregister(pdata->hwmon_dev);
222 sysfs_remove_group(&spi->dev.kobj, &ads7871_group);
223 kfree(pdata);
224 return 0;
225}
226
227static struct spi_driver ads7871_driver = {
228 .driver = {
229 .name = DEVICE_NAME,
230 .bus = &spi_bus_type,
231 .owner = THIS_MODULE,
232 },
233
234 .probe = ads7871_probe,
235 .remove = __devexit_p(ads7871_remove),
236};
237
238static int __init ads7871_init(void)
239{
240 return spi_register_driver(&ads7871_driver);
241}
242
243static void __exit ads7871_exit(void)
244{
245 spi_unregister_driver(&ads7871_driver);
246}
247
248module_init(ads7871_init);
249module_exit(ads7871_exit);
250
251MODULE_AUTHOR("Paul Thomas <pthomas8589@gmail.com>");
252MODULE_DESCRIPTION("TI ADS7871 A/D driver");
253MODULE_LICENSE("GPL");
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index e9b7fbc5a447..2988da150ed6 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -241,6 +241,55 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
241 return tjmax; 241 return tjmax;
242} 242}
243 243
244static int __devinit get_tjmax(struct cpuinfo_x86 *c, u32 id,
245 struct device *dev)
246{
 247	/* 100 C is the default for both mobile and non-mobile CPUs */
248 int err;
249 u32 eax, edx;
250 u32 val;
251
 252	/* On current Intel(R) processors, the IA32_TEMPERATURE_TARGET
 253	   MSR contains the TjMax value */
254 err = rdmsr_safe_on_cpu(id, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
255 if (err) {
256 dev_warn(dev, "Unable to read TjMax from CPU.\n");
257 } else {
258 val = (eax >> 16) & 0xff;
259 /*
 260	 * If the TjMax is not plausible, fall back to
 261	 * an assumed value.
262 */
263 if ((val > 80) && (val < 120)) {
264 dev_info(dev, "TjMax is %d C.\n", val);
265 return val * 1000;
266 }
267 }
268
269 /*
 270	 * An assumed value is used for early CPUs and when the MSR
 271	 * is unreadable. NOTE: the value may not be correct.
272 */
273
274 switch (c->x86_model) {
275 case 0xe:
276 case 0xf:
277 case 0x16:
278 case 0x1a:
279 dev_warn(dev, "TjMax is assumed as 100 C!\n");
280 return 100000;
282 case 0x17:
283 case 0x1c: /* Atom CPUs */
284 return adjust_tjmax(c, id, dev);
286 default:
287 dev_warn(dev, "CPU (model=0x%x) is not supported yet,"
288 " using default TjMax of 100C.\n", c->x86_model);
289 return 100000;
290 }
291}
292
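get_tjmax() pulls the TjMax field out of bits 23:16 of the MSR's low word and sanity-checks it against an 80-120 C window. A hedged sketch of just that extraction, with a made-up MSR value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t eax = 0x00640000;		/* made-up IA32_TEMPERATURE_TARGET low word */
	uint32_t val = (eax >> 16) & 0xff;	/* TjMax field: 0x64 = 100 */

	if (val > 80 && val < 120)
		printf("TjMax %u C -> %u millidegrees\n", val, val * 1000);
	else
		printf("implausible TjMax, use an assumed default\n");
	return 0;
}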
244static int __devinit coretemp_probe(struct platform_device *pdev) 293static int __devinit coretemp_probe(struct platform_device *pdev)
245{ 294{
246 struct coretemp_data *data; 295 struct coretemp_data *data;
@@ -283,14 +332,18 @@ static int __devinit coretemp_probe(struct platform_device *pdev)
283 } 332 }
284 } 333 }
285 334
286 data->tjmax = adjust_tjmax(c, data->id, &pdev->dev); 335 data->tjmax = get_tjmax(c, data->id, &pdev->dev);
287 platform_set_drvdata(pdev, data); 336 platform_set_drvdata(pdev, data);
288 337
289 /* read the still undocumented IA32_TEMPERATURE_TARGET it exists 338 /*
290 on older CPUs but not in this register, Atoms don't have it either */ 339 * read the still undocumented IA32_TEMPERATURE_TARGET. It exists
340 * on older CPUs but not in this register,
341 * Atoms don't have it either.
342 */
291 343
292 if ((c->x86_model > 0xe) && (c->x86_model != 0x1c)) { 344 if ((c->x86_model > 0xe) && (c->x86_model != 0x1c)) {
293 err = rdmsr_safe_on_cpu(data->id, 0x1a2, &eax, &edx); 345 err = rdmsr_safe_on_cpu(data->id, MSR_IA32_TEMPERATURE_TARGET,
346 &eax, &edx);
294 if (err) { 347 if (err) {
295 dev_warn(&pdev->dev, "Unable to read" 348 dev_warn(&pdev->dev, "Unable to read"
296 " IA32_TEMPERATURE_TARGET MSR\n"); 349 " IA32_TEMPERATURE_TARGET MSR\n");
@@ -451,28 +504,20 @@ static int __init coretemp_init(void)
451 504
452 for_each_online_cpu(i) { 505 for_each_online_cpu(i) {
453 struct cpuinfo_x86 *c = &cpu_data(i); 506 struct cpuinfo_x86 *c = &cpu_data(i);
507 /*
508 * CPUID.06H.EAX[0] indicates whether the CPU has thermal
 509	 * sensors. We check this bit only; all the early CPUs
 510	 * without thermal sensors are filtered out.
511 */
512 if (c->cpuid_level >= 6 && (cpuid_eax(0x06) & 0x01)) {
513 err = coretemp_device_add(i);
514 if (err)
515 goto exit_devices_unreg;
454 516
455 /* check if family 6, models 0xe (Pentium M DC), 517 } else {
456 0xf (Core 2 DC 65nm), 0x16 (Core 2 SC 65nm), 518 printk(KERN_INFO DRVNAME ": CPU (model=0x%x)"
457 0x17 (Penryn 45nm), 0x1a (Nehalem), 0x1c (Atom), 519 " has no thermal sensor.\n", c->x86_model);
458 0x1e (Lynnfield) */
459 if ((c->cpuid_level < 0) || (c->x86 != 0x6) ||
460 !((c->x86_model == 0xe) || (c->x86_model == 0xf) ||
461 (c->x86_model == 0x16) || (c->x86_model == 0x17) ||
462 (c->x86_model == 0x1a) || (c->x86_model == 0x1c) ||
463 (c->x86_model == 0x1e))) {
464
465 /* supported CPU not found, but report the unknown
466 family 6 CPU */
467 if ((c->x86 == 0x6) && (c->x86_model > 0xf))
468 printk(KERN_WARNING DRVNAME ": Unknown CPU "
469 "model 0x%x\n", c->x86_model);
470 continue;
471 } 520 }
472
473 err = coretemp_device_add(i);
474 if (err)
475 goto exit_devices_unreg;
476 } 521 }
477 if (list_empty(&pdev_list)) { 522 if (list_empty(&pdev_list)) {
478 err = -ENODEV; 523 err = -ENODEV;
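The rewritten coretemp_init() loop keys off CPUID leaf 0x06, EAX bit 0 (digital thermal sensor) instead of a model whitelist. A hedged userspace equivalent of that feature test, using GCC's <cpuid.h> on x86:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 0x06: thermal and power management feature flags */
	if (!__get_cpuid(0x06, &eax, &ebx, &ecx, &edx))
		return 1;			/* leaf not supported */

	if (eax & 0x01)
		printf("digital thermal sensor present\n");
	else
		printf("CPU has no thermal sensor\n");
	return 0;
}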
diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c
index b2f2277cad3c..6138f036b159 100644
--- a/drivers/hwmon/lis3lv02d.c
+++ b/drivers/hwmon/lis3lv02d.c
@@ -41,6 +41,8 @@
41 41
42/* joystick device poll interval in milliseconds */ 42/* joystick device poll interval in milliseconds */
43#define MDPS_POLL_INTERVAL 50 43#define MDPS_POLL_INTERVAL 50
44#define MDPS_POLL_MIN 0
45#define MDPS_POLL_MAX 2000
44/* 46/*
45 * The sensor can also generate interrupts (DRDY) but it's pretty pointless 47 * The sensor can also generate interrupts (DRDY) but it's pretty pointless
46 * because they are generated even if the data do not change. So it's better 48 * because they are generated even if the data do not change. So it's better
@@ -121,11 +123,9 @@ static void lis3lv02d_get_xyz(struct lis3lv02d *lis3, int *x, int *y, int *z)
121 int position[3]; 123 int position[3];
122 int i; 124 int i;
123 125
124 mutex_lock(&lis3->mutex);
125 position[0] = lis3->read_data(lis3, OUTX); 126 position[0] = lis3->read_data(lis3, OUTX);
126 position[1] = lis3->read_data(lis3, OUTY); 127 position[1] = lis3->read_data(lis3, OUTY);
127 position[2] = lis3->read_data(lis3, OUTZ); 128 position[2] = lis3->read_data(lis3, OUTZ);
128 mutex_unlock(&lis3->mutex);
129 129
130 for (i = 0; i < 3; i++) 130 for (i = 0; i < 3; i++)
131 position[i] = (position[i] * lis3->scale) / LIS3_ACCURACY; 131 position[i] = (position[i] * lis3->scale) / LIS3_ACCURACY;
@@ -249,8 +249,24 @@ void lis3lv02d_poweron(struct lis3lv02d *lis3)
249EXPORT_SYMBOL_GPL(lis3lv02d_poweron); 249EXPORT_SYMBOL_GPL(lis3lv02d_poweron);
250 250
251 251
252static void lis3lv02d_joystick_poll(struct input_polled_dev *pidev)
253{
254 int x, y, z;
255
256 mutex_lock(&lis3_dev.mutex);
257 lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z);
258 input_report_abs(pidev->input, ABS_X, x);
259 input_report_abs(pidev->input, ABS_Y, y);
260 input_report_abs(pidev->input, ABS_Z, z);
261 input_sync(pidev->input);
262 mutex_unlock(&lis3_dev.mutex);
263}
264
252static irqreturn_t lis302dl_interrupt(int irq, void *dummy) 265static irqreturn_t lis302dl_interrupt(int irq, void *dummy)
253{ 266{
267 if (!test_bit(0, &lis3_dev.misc_opened))
268 goto out;
269
254 /* 270 /*
255 * Be careful: on some HP laptops the bios force DD when on battery and 271 * Be careful: on some HP laptops the bios force DD when on battery and
256 * the lid is closed. This leads to interrupts as soon as a little move 272 * the lid is closed. This leads to interrupts as soon as a little move
@@ -260,44 +276,93 @@ static irqreturn_t lis302dl_interrupt(int irq, void *dummy)
260 276
261 wake_up_interruptible(&lis3_dev.misc_wait); 277 wake_up_interruptible(&lis3_dev.misc_wait);
262 kill_fasync(&lis3_dev.async_queue, SIGIO, POLL_IN); 278 kill_fasync(&lis3_dev.async_queue, SIGIO, POLL_IN);
279out:
280 if (lis3_dev.whoami == WAI_8B && lis3_dev.idev &&
281 lis3_dev.idev->input->users)
282 return IRQ_WAKE_THREAD;
263 return IRQ_HANDLED; 283 return IRQ_HANDLED;
264} 284}
265 285
266static int lis3lv02d_misc_open(struct inode *inode, struct file *file) 286static void lis302dl_interrupt_handle_click(struct lis3lv02d *lis3)
267{ 287{
268 int ret; 288 struct input_dev *dev = lis3->idev->input;
289 u8 click_src;
269 290
270 if (test_and_set_bit(0, &lis3_dev.misc_opened)) 291 mutex_lock(&lis3->mutex);
271 return -EBUSY; /* already open */ 292 lis3->read(lis3, CLICK_SRC, &click_src);
272 293
273 atomic_set(&lis3_dev.count, 0); 294 if (click_src & CLICK_SINGLE_X) {
295 input_report_key(dev, lis3->mapped_btns[0], 1);
296 input_report_key(dev, lis3->mapped_btns[0], 0);
297 }
274 298
275 /* 299 if (click_src & CLICK_SINGLE_Y) {
276 * The sensor can generate interrupts for free-fall and direction 300 input_report_key(dev, lis3->mapped_btns[1], 1);
277 * detection (distinguishable with FF_WU_SRC and DD_SRC) but to keep 301 input_report_key(dev, lis3->mapped_btns[1], 0);
278 * the things simple and _fast_ we activate it only for free-fall, so 302 }
279 * no need to read register (very slow with ACPI). For the same reason,
280 * we forbid shared interrupts.
281 *
282 * IRQF_TRIGGER_RISING seems pointless on HP laptops because the
283 * io-apic is not configurable (and generates a warning) but I keep it
284 * in case of support for other hardware.
285 */
286 ret = request_irq(lis3_dev.irq, lis302dl_interrupt, IRQF_TRIGGER_RISING,
287 DRIVER_NAME, &lis3_dev);
288 303
289 if (ret) { 304 if (click_src & CLICK_SINGLE_Z) {
290 clear_bit(0, &lis3_dev.misc_opened); 305 input_report_key(dev, lis3->mapped_btns[2], 1);
291 printk(KERN_ERR DRIVER_NAME ": IRQ%d allocation failed\n", lis3_dev.irq); 306 input_report_key(dev, lis3->mapped_btns[2], 0);
292 return -EBUSY;
293 } 307 }
308 input_sync(dev);
309 mutex_unlock(&lis3->mutex);
310}
311
312static void lis302dl_interrupt_handle_ff_wu(struct lis3lv02d *lis3)
313{
314 u8 wu1_src;
315 u8 wu2_src;
316
317 lis3->read(lis3, FF_WU_SRC_1, &wu1_src);
318 lis3->read(lis3, FF_WU_SRC_2, &wu2_src);
319
320 wu1_src = wu1_src & FF_WU_SRC_IA ? wu1_src : 0;
321 wu2_src = wu2_src & FF_WU_SRC_IA ? wu2_src : 0;
322
323 /* joystick poll is internally protected by the lis3->mutex. */
324 if (wu1_src || wu2_src)
325 lis3lv02d_joystick_poll(lis3_dev.idev);
326}
327
328static irqreturn_t lis302dl_interrupt_thread1_8b(int irq, void *data)
329{
330
331 struct lis3lv02d *lis3 = data;
332
333 if ((lis3->pdata->irq_cfg & LIS3_IRQ1_MASK) == LIS3_IRQ1_CLICK)
334 lis302dl_interrupt_handle_click(lis3);
335 else
336 lis302dl_interrupt_handle_ff_wu(lis3);
337
338 return IRQ_HANDLED;
339}
340
341static irqreturn_t lis302dl_interrupt_thread2_8b(int irq, void *data)
342{
343
344 struct lis3lv02d *lis3 = data;
345
346 if ((lis3->pdata->irq_cfg & LIS3_IRQ2_MASK) == LIS3_IRQ2_CLICK)
347 lis302dl_interrupt_handle_click(lis3);
348 else
349 lis302dl_interrupt_handle_ff_wu(lis3);
350
351 return IRQ_HANDLED;
352}
353
354static int lis3lv02d_misc_open(struct inode *inode, struct file *file)
355{
356 if (test_and_set_bit(0, &lis3_dev.misc_opened))
357 return -EBUSY; /* already open */
358
359 atomic_set(&lis3_dev.count, 0);
294 return 0; 360 return 0;
295} 361}
296 362
297static int lis3lv02d_misc_release(struct inode *inode, struct file *file) 363static int lis3lv02d_misc_release(struct inode *inode, struct file *file)
298{ 364{
299 fasync_helper(-1, file, 0, &lis3_dev.async_queue); 365 fasync_helper(-1, file, 0, &lis3_dev.async_queue);
300 free_irq(lis3_dev.irq, &lis3_dev);
301 clear_bit(0, &lis3_dev.misc_opened); /* release the device */ 366 clear_bit(0, &lis3_dev.misc_opened); /* release the device */
302 return 0; 367 return 0;
303} 368}
@@ -380,22 +445,12 @@ static struct miscdevice lis3lv02d_misc_device = {
380 .fops = &lis3lv02d_misc_fops, 445 .fops = &lis3lv02d_misc_fops,
381}; 446};
382 447
383static void lis3lv02d_joystick_poll(struct input_polled_dev *pidev)
384{
385 int x, y, z;
386
387 lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z);
388 input_report_abs(pidev->input, ABS_X, x);
389 input_report_abs(pidev->input, ABS_Y, y);
390 input_report_abs(pidev->input, ABS_Z, z);
391 input_sync(pidev->input);
392}
393
394int lis3lv02d_joystick_enable(void) 448int lis3lv02d_joystick_enable(void)
395{ 449{
396 struct input_dev *input_dev; 450 struct input_dev *input_dev;
397 int err; 451 int err;
398 int max_val, fuzz, flat; 452 int max_val, fuzz, flat;
453 int btns[] = {BTN_X, BTN_Y, BTN_Z};
399 454
400 if (lis3_dev.idev) 455 if (lis3_dev.idev)
401 return -EINVAL; 456 return -EINVAL;
@@ -406,6 +461,8 @@ int lis3lv02d_joystick_enable(void)
406 461
407 lis3_dev.idev->poll = lis3lv02d_joystick_poll; 462 lis3_dev.idev->poll = lis3lv02d_joystick_poll;
408 lis3_dev.idev->poll_interval = MDPS_POLL_INTERVAL; 463 lis3_dev.idev->poll_interval = MDPS_POLL_INTERVAL;
464 lis3_dev.idev->poll_interval_min = MDPS_POLL_MIN;
465 lis3_dev.idev->poll_interval_max = MDPS_POLL_MAX;
409 input_dev = lis3_dev.idev->input; 466 input_dev = lis3_dev.idev->input;
410 467
411 input_dev->name = "ST LIS3LV02DL Accelerometer"; 468 input_dev->name = "ST LIS3LV02DL Accelerometer";
@@ -422,6 +479,10 @@ int lis3lv02d_joystick_enable(void)
422 input_set_abs_params(input_dev, ABS_Y, -max_val, max_val, fuzz, flat); 479 input_set_abs_params(input_dev, ABS_Y, -max_val, max_val, fuzz, flat);
423 input_set_abs_params(input_dev, ABS_Z, -max_val, max_val, fuzz, flat); 480 input_set_abs_params(input_dev, ABS_Z, -max_val, max_val, fuzz, flat);
424 481
482 lis3_dev.mapped_btns[0] = lis3lv02d_get_axis(abs(lis3_dev.ac.x), btns);
483 lis3_dev.mapped_btns[1] = lis3lv02d_get_axis(abs(lis3_dev.ac.y), btns);
484 lis3_dev.mapped_btns[2] = lis3lv02d_get_axis(abs(lis3_dev.ac.z), btns);
485
425 err = input_register_polled_device(lis3_dev.idev); 486 err = input_register_polled_device(lis3_dev.idev);
426 if (err) { 487 if (err) {
427 input_free_polled_device(lis3_dev.idev); 488 input_free_polled_device(lis3_dev.idev);
@@ -434,6 +495,11 @@ EXPORT_SYMBOL_GPL(lis3lv02d_joystick_enable);
434 495
435void lis3lv02d_joystick_disable(void) 496void lis3lv02d_joystick_disable(void)
436{ 497{
498 if (lis3_dev.irq)
499 free_irq(lis3_dev.irq, &lis3_dev);
500 if (lis3_dev.pdata && lis3_dev.pdata->irq2)
501 free_irq(lis3_dev.pdata->irq2, &lis3_dev);
502
437 if (!lis3_dev.idev) 503 if (!lis3_dev.idev)
438 return; 504 return;
439 505
@@ -462,7 +528,9 @@ static ssize_t lis3lv02d_position_show(struct device *dev,
462{ 528{
463 int x, y, z; 529 int x, y, z;
464 530
531 mutex_lock(&lis3_dev.mutex);
465 lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z); 532 lis3lv02d_get_xyz(&lis3_dev, &x, &y, &z);
533 mutex_unlock(&lis3_dev.mutex);
466 return sprintf(buf, "(%d,%d,%d)\n", x, y, z); 534 return sprintf(buf, "(%d,%d,%d)\n", x, y, z);
467} 535}
468 536
@@ -521,12 +589,70 @@ int lis3lv02d_remove_fs(struct lis3lv02d *lis3)
521} 589}
522EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs); 590EXPORT_SYMBOL_GPL(lis3lv02d_remove_fs);
523 591
592static void lis3lv02d_8b_configure(struct lis3lv02d *dev,
593 struct lis3lv02d_platform_data *p)
594{
595 int err;
596 int ctrl2 = p->hipass_ctrl;
597
598 if (p->click_flags) {
599 dev->write(dev, CLICK_CFG, p->click_flags);
600 dev->write(dev, CLICK_TIMELIMIT, p->click_time_limit);
601 dev->write(dev, CLICK_LATENCY, p->click_latency);
602 dev->write(dev, CLICK_WINDOW, p->click_window);
603 dev->write(dev, CLICK_THSZ, p->click_thresh_z & 0xf);
604 dev->write(dev, CLICK_THSY_X,
605 (p->click_thresh_x & 0xf) |
606 (p->click_thresh_y << 4));
607
608 if (dev->idev) {
609 struct input_dev *input_dev = lis3_dev.idev->input;
610 input_set_capability(input_dev, EV_KEY, BTN_X);
611 input_set_capability(input_dev, EV_KEY, BTN_Y);
612 input_set_capability(input_dev, EV_KEY, BTN_Z);
613 }
614 }
615
616 if (p->wakeup_flags) {
617 dev->write(dev, FF_WU_CFG_1, p->wakeup_flags);
618 dev->write(dev, FF_WU_THS_1, p->wakeup_thresh & 0x7f);
619 /* default to 2.5ms for now */
620 dev->write(dev, FF_WU_DURATION_1, 1);
621 ctrl2 ^= HP_FF_WU1; /* Xor to keep compatible with old pdata*/
622 }
623
624 if (p->wakeup_flags2) {
625 dev->write(dev, FF_WU_CFG_2, p->wakeup_flags2);
626 dev->write(dev, FF_WU_THS_2, p->wakeup_thresh2 & 0x7f);
627 /* default to 2.5ms for now */
628 dev->write(dev, FF_WU_DURATION_2, 1);
629 ctrl2 ^= HP_FF_WU2; /* Xor to keep compatible with old pdata*/
630 }
631 /* Configure hipass filters */
632 dev->write(dev, CTRL_REG2, ctrl2);
633
634 if (p->irq2) {
635 err = request_threaded_irq(p->irq2,
636 NULL,
637 lis302dl_interrupt_thread2_8b,
638 IRQF_TRIGGER_RISING |
639 IRQF_ONESHOT,
640 DRIVER_NAME, &lis3_dev);
641 if (err < 0)
642 printk(KERN_ERR DRIVER_NAME
643 "No second IRQ. Limited functionality\n");
644 }
645}
646
524/* 647/*
525 * Initialise the accelerometer and the various subsystems. 648 * Initialise the accelerometer and the various subsystems.
526 * Should be rather independent of the bus system. 649 * Should be rather independent of the bus system.
527 */ 650 */
528int lis3lv02d_init_device(struct lis3lv02d *dev) 651int lis3lv02d_init_device(struct lis3lv02d *dev)
529{ 652{
653 int err;
654 irq_handler_t thread_fn;
655
530 dev->whoami = lis3lv02d_read_8(dev, WHO_AM_I); 656 dev->whoami = lis3lv02d_read_8(dev, WHO_AM_I);
531 657
532 switch (dev->whoami) { 658 switch (dev->whoami) {
@@ -567,25 +693,8 @@ int lis3lv02d_init_device(struct lis3lv02d *dev)
567 if (dev->pdata) { 693 if (dev->pdata) {
568 struct lis3lv02d_platform_data *p = dev->pdata; 694 struct lis3lv02d_platform_data *p = dev->pdata;
569 695
570 if (p->click_flags && (dev->whoami == WAI_8B)) { 696 if (dev->whoami == WAI_8B)
571 dev->write(dev, CLICK_CFG, p->click_flags); 697 lis3lv02d_8b_configure(dev, p);
572 dev->write(dev, CLICK_TIMELIMIT, p->click_time_limit);
573 dev->write(dev, CLICK_LATENCY, p->click_latency);
574 dev->write(dev, CLICK_WINDOW, p->click_window);
575 dev->write(dev, CLICK_THSZ, p->click_thresh_z & 0xf);
576 dev->write(dev, CLICK_THSY_X,
577 (p->click_thresh_x & 0xf) |
578 (p->click_thresh_y << 4));
579 }
580
581 if (p->wakeup_flags && (dev->whoami == WAI_8B)) {
582 dev->write(dev, FF_WU_CFG_1, p->wakeup_flags);
583 dev->write(dev, FF_WU_THS_1, p->wakeup_thresh & 0x7f);
584 /* default to 2.5ms for now */
585 dev->write(dev, FF_WU_DURATION_1, 1);
586 /* enable high pass filter for both free-fall units */
587 dev->write(dev, CTRL_REG2, HP_FF_WU1 | HP_FF_WU2);
588 }
589 698
590 if (p->irq_cfg) 699 if (p->irq_cfg)
591 dev->write(dev, CTRL_REG3, p->irq_cfg); 700 dev->write(dev, CTRL_REG3, p->irq_cfg);
@@ -598,6 +707,32 @@ int lis3lv02d_init_device(struct lis3lv02d *dev)
598 goto out; 707 goto out;
599 } 708 }
600 709
710 /*
711 * The sensor can generate interrupts for free-fall and direction
712 * detection (distinguishable with FF_WU_SRC and DD_SRC) but to keep
713 * the things simple and _fast_ we activate it only for free-fall, so
714 * no need to read register (very slow with ACPI). For the same reason,
715 * we forbid shared interrupts.
716 *
717 * IRQF_TRIGGER_RISING seems pointless on HP laptops because the
718 * io-apic is not configurable (and generates a warning) but I keep it
719 * in case of support for other hardware.
720 */
721 if (dev->whoami == WAI_8B)
722 thread_fn = lis302dl_interrupt_thread1_8b;
723 else
724 thread_fn = NULL;
725
726 err = request_threaded_irq(dev->irq, lis302dl_interrupt,
727 thread_fn,
728 IRQF_TRIGGER_RISING | IRQF_ONESHOT,
729 DRIVER_NAME, &lis3_dev);
730
731 if (err < 0) {
 732		printk(KERN_ERR DRIVER_NAME ": Cannot get IRQ\n");
733 goto out;
734 }
735
601 if (misc_register(&lis3lv02d_misc_device)) 736 if (misc_register(&lis3lv02d_misc_device))
602 printk(KERN_ERR DRIVER_NAME ": misc_register failed\n"); 737 printk(KERN_ERR DRIVER_NAME ": misc_register failed\n");
603out: 738out:
diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h
index e6a01f44709b..854091380e33 100644
--- a/drivers/hwmon/lis3lv02d.h
+++ b/drivers/hwmon/lis3lv02d.h
@@ -196,6 +196,16 @@ enum lis3lv02d_dd_src {
196 DD_SRC_IA = 0x40, 196 DD_SRC_IA = 0x40,
197}; 197};
198 198
199enum lis3lv02d_click_src_8b {
200 CLICK_SINGLE_X = 0x01,
201 CLICK_DOUBLE_X = 0x02,
202 CLICK_SINGLE_Y = 0x04,
203 CLICK_DOUBLE_Y = 0x08,
204 CLICK_SINGLE_Z = 0x10,
205 CLICK_DOUBLE_Z = 0x20,
206 CLICK_IA = 0x40,
207};
208
199struct axis_conversion { 209struct axis_conversion {
200 s8 x; 210 s8 x;
201 s8 y; 211 s8 y;
@@ -223,6 +233,7 @@ struct lis3lv02d {
223 struct platform_device *pdev; /* platform device */ 233 struct platform_device *pdev; /* platform device */
224 atomic_t count; /* interrupt count after last read */ 234 atomic_t count; /* interrupt count after last read */
225 struct axis_conversion ac; /* hw -> logical axis */ 235 struct axis_conversion ac; /* hw -> logical axis */
236 int mapped_btns[3];
226 237
227 u32 irq; /* IRQ number */ 238 u32 irq; /* IRQ number */
228 struct fasync_struct *async_queue; /* queue for the misc device */ 239 struct fasync_struct *async_queue; /* queue for the misc device */
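The new CLICK_SRC bit assignments decode straightforwardly; lis302dl_interrupt_handle_click() turns each set single-click bit into a button press/release pair. A hedged standalone sketch of the decode, with a made-up status byte:

#include <stdio.h>

enum {
	CLICK_SINGLE_X = 0x01,
	CLICK_DOUBLE_X = 0x02,
	CLICK_SINGLE_Y = 0x04,
	CLICK_DOUBLE_Y = 0x08,
	CLICK_SINGLE_Z = 0x10,
	CLICK_DOUBLE_Z = 0x20,
	CLICK_IA       = 0x40,
};

int main(void)
{
	unsigned char click_src = CLICK_SINGLE_X | CLICK_DOUBLE_Z; /* made-up */

	if (click_src & CLICK_SINGLE_X)
		printf("single click on X\n");
	if (click_src & CLICK_SINGLE_Y)
		printf("single click on Y\n");
	if (click_src & CLICK_SINGLE_Z)
		printf("single click on Z\n");
	if (click_src & CLICK_IA)
		printf("interrupt active\n");
	return 0;
}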
diff --git a/drivers/isdn/gigaset/capi.c b/drivers/isdn/gigaset/capi.c
index 964a55fb1486..ac4cfeed3946 100644
--- a/drivers/isdn/gigaset/capi.c
+++ b/drivers/isdn/gigaset/capi.c
@@ -170,17 +170,6 @@ static inline void ignore_cstruct_param(struct cardstate *cs, _cstruct param,
170} 170}
171 171
172/* 172/*
173 * convert hex to binary
174 */
175static inline u8 hex2bin(char c)
176{
177 int result = c & 0x0f;
178 if (c & 0x40)
179 result += 9;
180 return result;
181}
182
183/*
184 * convert an IE from Gigaset hex string to ETSI binary representation 173 * convert an IE from Gigaset hex string to ETSI binary representation
185 * including length byte 174 * including length byte
186 * return value: result length, -1 on error 175 * return value: result length, -1 on error
@@ -191,7 +180,7 @@ static int encode_ie(char *in, u8 *out, int maxlen)
191 while (*in) { 180 while (*in) {
192 if (!isxdigit(in[0]) || !isxdigit(in[1]) || l >= maxlen) 181 if (!isxdigit(in[0]) || !isxdigit(in[1]) || l >= maxlen)
193 return -1; 182 return -1;
194 out[++l] = (hex2bin(in[0]) << 4) + hex2bin(in[1]); 183 out[++l] = (hex_to_bin(in[0]) << 4) + hex_to_bin(in[1]);
195 in += 2; 184 in += 2;
196 } 185 }
197 out[0] = l; 186 out[0] = l;
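With the local helper gone, encode_ie() leans on the kernel-wide hex_to_bin(). The sketch below mirrors the converted loop in plain C; the local hex_to_bin() stand-in matches the kernel helper only for valid hex digits, which the isxdigit() guard ensures here.

#include <stdio.h>
#include <ctype.h>

/* local stand-in mirroring the kernel's hex_to_bin() for valid digits */
static int hex_to_bin(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	return (tolower(c) - 'a') + 10;
}

static int encode_ie(const char *in, unsigned char *out, int maxlen)
{
	int l = 0;

	while (*in) {
		if (!isxdigit(in[0]) || !isxdigit(in[1]) || l >= maxlen)
			return -1;
		out[++l] = (hex_to_bin(in[0]) << 4) + hex_to_bin(in[1]);
		in += 2;
	}
	out[0] = l;	/* length byte comes first */
	return l;
}

int main(void)
{
	unsigned char buf[8];
	int n = encode_ie("80901a", buf, sizeof(buf) - 1);

	printf("len=%d: %02x %02x %02x %02x\n", n, buf[0], buf[1], buf[2], buf[3]);
	/* prints: len=3: 03 80 90 1a */
	return 0;
}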
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 0d0d625fece2..26386a92f5aa 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -14,11 +14,17 @@ menuconfig MISC_DEVICES
14if MISC_DEVICES 14if MISC_DEVICES
15 15
16config AD525X_DPOT 16config AD525X_DPOT
17 tristate "Analog Devices AD525x Digital Potentiometers" 17 tristate "Analog Devices Digital Potentiometers"
18 depends on I2C && SYSFS 18 depends on (I2C || SPI) && SYSFS
19 help 19 help
20 If you say yes here, you get support for the Analog Devices 20 If you say yes here, you get support for the Analog Devices
21 AD5258, AD5259, AD5251, AD5252, AD5253, AD5254 and AD5255 21 AD5258, AD5259, AD5251, AD5252, AD5253, AD5254, AD5255
22 AD5160, AD5161, AD5162, AD5165, AD5200, AD5201, AD5203,
23 AD5204, AD5206, AD5207, AD5231, AD5232, AD5233, AD5235,
24 AD5260, AD5262, AD5263, AD5290, AD5291, AD5292, AD5293,
25 AD7376, AD8400, AD8402, AD8403, ADN2850, AD5241, AD5242,
26 AD5243, AD5245, AD5246, AD5247, AD5248, AD5280, AD5282,
27 ADN2860, AD5273, AD5171, AD5170, AD5172, AD5173
22 digital potentiometer chips. 28 digital potentiometer chips.
23 29
24 See Documentation/misc-devices/ad525x_dpot.txt for the 30 See Documentation/misc-devices/ad525x_dpot.txt for the
@@ -27,6 +33,26 @@ config AD525X_DPOT
27 This driver can also be built as a module. If so, the module 33 This driver can also be built as a module. If so, the module
28 will be called ad525x_dpot. 34 will be called ad525x_dpot.
29 35
36config AD525X_DPOT_I2C
37 tristate "support I2C bus connection"
38 depends on AD525X_DPOT && I2C
39 help
 40	  Say Y here if you have digital potentiometers hooked to an I2C bus.
41
42 To compile this driver as a module, choose M here: the
43 module will be called ad525x_dpot-i2c.
44
45config AD525X_DPOT_SPI
46 tristate "support SPI bus connection"
47 depends on AD525X_DPOT && SPI_MASTER
48 help
 49	  Say Y here if you have digital potentiometers hooked to an SPI bus.
50
51 If unsure, say N (but it's safe to say "Y").
52
53 To compile this driver as a module, choose M here: the
54 module will be called ad525x_dpot-spi.
55
30config ATMEL_PWM 56config ATMEL_PWM
31 tristate "Atmel AT32/AT91 PWM support" 57 tristate "Atmel AT32/AT91 PWM support"
32 depends on AVR32 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_AT91CAP9 58 depends on AVR32 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_AT91CAP9
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index f12dc3e54402..6ed06a19474a 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -4,6 +4,8 @@
4 4
5obj-$(CONFIG_IBM_ASM) += ibmasm/ 5obj-$(CONFIG_IBM_ASM) += ibmasm/
6obj-$(CONFIG_AD525X_DPOT) += ad525x_dpot.o 6obj-$(CONFIG_AD525X_DPOT) += ad525x_dpot.o
7obj-$(CONFIG_AD525X_DPOT_I2C) += ad525x_dpot-i2c.o
8obj-$(CONFIG_AD525X_DPOT_SPI) += ad525x_dpot-spi.o
7obj-$(CONFIG_ATMEL_PWM) += atmel_pwm.o 9obj-$(CONFIG_ATMEL_PWM) += atmel_pwm.o
8obj-$(CONFIG_ATMEL_SSC) += atmel-ssc.o 10obj-$(CONFIG_ATMEL_SSC) += atmel-ssc.o
9obj-$(CONFIG_ATMEL_TCLIB) += atmel_tclib.o 11obj-$(CONFIG_ATMEL_TCLIB) += atmel_tclib.o
diff --git a/drivers/misc/ad525x_dpot-i2c.c b/drivers/misc/ad525x_dpot-i2c.c
new file mode 100644
index 000000000000..374352af7979
--- /dev/null
+++ b/drivers/misc/ad525x_dpot-i2c.c
@@ -0,0 +1,134 @@
1/*
2 * Driver for the Analog Devices digital potentiometers (I2C bus)
3 *
4 * Copyright (C) 2010 Michael Hennerich, Analog Devices Inc.
5 *
6 * Licensed under the GPL-2 or later.
7 */
8
9#include <linux/i2c.h>
10#include <linux/module.h>
11
12#include "ad525x_dpot.h"
13
14/* ------------------------------------------------------------------------- */
15/* I2C bus functions */
16static int write_d8(void *client, u8 val)
17{
18 return i2c_smbus_write_byte(client, val);
19}
20
21static int write_r8d8(void *client, u8 reg, u8 val)
22{
23 return i2c_smbus_write_byte_data(client, reg, val);
24}
25
26static int write_r8d16(void *client, u8 reg, u16 val)
27{
28 return i2c_smbus_write_word_data(client, reg, val);
29}
30
31static int read_d8(void *client)
32{
33 return i2c_smbus_read_byte(client);
34}
35
36static int read_r8d8(void *client, u8 reg)
37{
38 return i2c_smbus_read_byte_data(client, reg);
39}
40
41static int read_r8d16(void *client, u8 reg)
42{
43 return i2c_smbus_read_word_data(client, reg);
44}
45
46static const struct ad_dpot_bus_ops bops = {
47 .read_d8 = read_d8,
48 .read_r8d8 = read_r8d8,
49 .read_r8d16 = read_r8d16,
50 .write_d8 = write_d8,
51 .write_r8d8 = write_r8d8,
52 .write_r8d16 = write_r8d16,
53};
54
55static int __devinit ad_dpot_i2c_probe(struct i2c_client *client,
56 const struct i2c_device_id *id)
57{
58 struct ad_dpot_bus_data bdata = {
59 .client = client,
60 .bops = &bops,
61 };
62
63 struct ad_dpot_id dpot_id = {
64 .name = (char *) &id->name,
65 .devid = id->driver_data,
66 };
67
68 if (!i2c_check_functionality(client->adapter,
69 I2C_FUNC_SMBUS_WORD_DATA)) {
 70		dev_err(&client->dev, "SMBus word data not supported\n");
71 return -EIO;
72 }
73
74 return ad_dpot_probe(&client->dev, &bdata, &dpot_id);
75}
76
77static int __devexit ad_dpot_i2c_remove(struct i2c_client *client)
78{
79 return ad_dpot_remove(&client->dev);
80}
81
82static const struct i2c_device_id ad_dpot_id[] = {
83 {"ad5258", AD5258_ID},
84 {"ad5259", AD5259_ID},
85 {"ad5251", AD5251_ID},
86 {"ad5252", AD5252_ID},
87 {"ad5253", AD5253_ID},
88 {"ad5254", AD5254_ID},
89 {"ad5255", AD5255_ID},
90 {"ad5241", AD5241_ID},
91 {"ad5242", AD5242_ID},
92 {"ad5243", AD5243_ID},
93 {"ad5245", AD5245_ID},
94 {"ad5246", AD5246_ID},
95 {"ad5247", AD5247_ID},
96 {"ad5248", AD5248_ID},
97 {"ad5280", AD5280_ID},
98 {"ad5282", AD5282_ID},
99 {"adn2860", ADN2860_ID},
100 {"ad5273", AD5273_ID},
101 {"ad5171", AD5171_ID},
102 {"ad5170", AD5170_ID},
103 {"ad5172", AD5172_ID},
104 {"ad5173", AD5173_ID},
105 {}
106};
107MODULE_DEVICE_TABLE(i2c, ad_dpot_id);
108
109static struct i2c_driver ad_dpot_i2c_driver = {
110 .driver = {
111 .name = "ad_dpot",
112 .owner = THIS_MODULE,
113 },
114 .probe = ad_dpot_i2c_probe,
115 .remove = __devexit_p(ad_dpot_i2c_remove),
116 .id_table = ad_dpot_id,
117};
118
119static int __init ad_dpot_i2c_init(void)
120{
121 return i2c_add_driver(&ad_dpot_i2c_driver);
122}
123module_init(ad_dpot_i2c_init);
124
125static void __exit ad_dpot_i2c_exit(void)
126{
127 i2c_del_driver(&ad_dpot_i2c_driver);
128}
129module_exit(ad_dpot_i2c_exit);
130
131MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
132MODULE_DESCRIPTION("digital potentiometer I2C bus driver");
133MODULE_LICENSE("GPL");
134MODULE_ALIAS("i2c:ad_dpot");
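All six I2C callbacks above are published through ad_dpot_bus_ops, so the bus-neutral core in ad525x_dpot.c never calls i2c_smbus_* directly; the SPI glue fills the same table with its own functions. A hedged, userspace-sized sketch of this ops-table pattern with hypothetical demo_* names:

#include <stdio.h>

/* hypothetical bus-neutral ops table, same shape as ad_dpot_bus_ops */
struct demo_bus_ops {
	int (*write_r8d8)(void *client, unsigned char reg, unsigned char val);
	int (*read_r8d8)(void *client, unsigned char reg);
};

/* one possible "bus" backend: a fake register file */
static unsigned char fake_regs[256];

static int fake_write_r8d8(void *client, unsigned char reg, unsigned char val)
{
	(void)client;
	fake_regs[reg] = val;
	return 0;
}

static int fake_read_r8d8(void *client, unsigned char reg)
{
	(void)client;
	return fake_regs[reg];
}

static const struct demo_bus_ops fake_bops = {
	.write_r8d8 = fake_write_r8d8,
	.read_r8d8 = fake_read_r8d8,
};

/* the "core" only ever sees the ops table */
int main(void)
{
	const struct demo_bus_ops *bops = &fake_bops;

	bops->write_r8d8(NULL, 0x00, 42);
	printf("rdac0 = %d\n", bops->read_r8d8(NULL, 0x00));
	return 0;
}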
diff --git a/drivers/misc/ad525x_dpot-spi.c b/drivers/misc/ad525x_dpot-spi.c
new file mode 100644
index 000000000000..b8c6df9c8437
--- /dev/null
+++ b/drivers/misc/ad525x_dpot-spi.c
@@ -0,0 +1,172 @@
1/*
2 * Driver for the Analog Devices digital potentiometers (SPI bus)
3 *
4 * Copyright (C) 2010 Michael Hennerich, Analog Devices Inc.
5 *
6 * Licensed under the GPL-2 or later.
7 */
8
9#include <linux/spi/spi.h>
10#include <linux/module.h>
11
12#include "ad525x_dpot.h"
13
14static const struct ad_dpot_id ad_dpot_spi_devlist[] = {
15 {.name = "ad5160", .devid = AD5160_ID},
16 {.name = "ad5161", .devid = AD5161_ID},
17 {.name = "ad5162", .devid = AD5162_ID},
18 {.name = "ad5165", .devid = AD5165_ID},
19 {.name = "ad5200", .devid = AD5200_ID},
20 {.name = "ad5201", .devid = AD5201_ID},
21 {.name = "ad5203", .devid = AD5203_ID},
22 {.name = "ad5204", .devid = AD5204_ID},
23 {.name = "ad5206", .devid = AD5206_ID},
24 {.name = "ad5207", .devid = AD5207_ID},
25 {.name = "ad5231", .devid = AD5231_ID},
26 {.name = "ad5232", .devid = AD5232_ID},
27 {.name = "ad5233", .devid = AD5233_ID},
28 {.name = "ad5235", .devid = AD5235_ID},
29 {.name = "ad5260", .devid = AD5260_ID},
30 {.name = "ad5262", .devid = AD5262_ID},
31 {.name = "ad5263", .devid = AD5263_ID},
32 {.name = "ad5290", .devid = AD5290_ID},
33 {.name = "ad5291", .devid = AD5291_ID},
34 {.name = "ad5292", .devid = AD5292_ID},
35 {.name = "ad5293", .devid = AD5293_ID},
36 {.name = "ad7376", .devid = AD7376_ID},
37 {.name = "ad8400", .devid = AD8400_ID},
38 {.name = "ad8402", .devid = AD8402_ID},
39 {.name = "ad8403", .devid = AD8403_ID},
40 {.name = "adn2850", .devid = ADN2850_ID},
41 {}
42};
43
44/* ------------------------------------------------------------------------- */
45
46/* SPI bus functions */
47static int write8(void *client, u8 val)
48{
49 u8 data = val;
50 return spi_write(client, &data, 1);
51}
52
53static int write16(void *client, u8 reg, u8 val)
54{
55 u8 data[2] = {reg, val};
 56	return spi_write(client, data, sizeof(data)); /* reg + value: 2 bytes */
57}
58
59static int write24(void *client, u8 reg, u16 val)
60{
61 u8 data[3] = {reg, val >> 8, val};
 62	return spi_write(client, data, sizeof(data)); /* reg + 16-bit value: 3 bytes */
63}
64
65static int read8(void *client)
66{
67 int ret;
68 u8 data;
69 ret = spi_read(client, &data, 1);
70 if (ret < 0)
71 return ret;
72
73 return data;
74}
75
76static int read16(void *client, u8 reg)
77{
78 int ret;
79 u8 buf_rx[2];
80
81 write16(client, reg, 0);
82 ret = spi_read(client, buf_rx, 2);
83 if (ret < 0)
84 return ret;
85
86 return (buf_rx[0] << 8) | buf_rx[1];
87}
88
89static int read24(void *client, u8 reg)
90{
91 int ret;
92 u8 buf_rx[3];
93
94 write24(client, reg, 0);
95 ret = spi_read(client, buf_rx, 3);
96 if (ret < 0)
97 return ret;
98
99 return (buf_rx[1] << 8) | buf_rx[2];
100}
101
102static const struct ad_dpot_bus_ops bops = {
103 .read_d8 = read8,
104 .read_r8d8 = read16,
105 .read_r8d16 = read24,
106 .write_d8 = write8,
107 .write_r8d8 = write16,
108 .write_r8d16 = write24,
109};
110
111static const struct ad_dpot_id *dpot_match_id(const struct ad_dpot_id *id,
112 char *name)
113{
114 while (id->name && id->name[0]) {
115 if (strcmp(name, id->name) == 0)
116 return id;
117 id++;
118 }
119 return NULL;
120}
121
122static int __devinit ad_dpot_spi_probe(struct spi_device *spi)
123{
124 char *name = spi->dev.platform_data;
125 const struct ad_dpot_id *dpot_id;
126
127 struct ad_dpot_bus_data bdata = {
128 .client = spi,
129 .bops = &bops,
130 };
131
132 dpot_id = dpot_match_id(ad_dpot_spi_devlist, name);
133
134 if (dpot_id == NULL) {
 135		dev_err(&spi->dev, "%s not in supported device list\n", name);
136 return -ENODEV;
137 }
138
139 return ad_dpot_probe(&spi->dev, &bdata, dpot_id);
140}
141
142static int __devexit ad_dpot_spi_remove(struct spi_device *spi)
143{
144 return ad_dpot_remove(&spi->dev);
145}
146
147static struct spi_driver ad_dpot_spi_driver = {
148 .driver = {
149 .name = "ad_dpot",
150 .bus = &spi_bus_type,
151 .owner = THIS_MODULE,
152 },
153 .probe = ad_dpot_spi_probe,
154 .remove = __devexit_p(ad_dpot_spi_remove),
155};
156
157static int __init ad_dpot_spi_init(void)
158{
159 return spi_register_driver(&ad_dpot_spi_driver);
160}
161module_init(ad_dpot_spi_init);
162
163static void __exit ad_dpot_spi_exit(void)
164{
165 spi_unregister_driver(&ad_dpot_spi_driver);
166}
167module_exit(ad_dpot_spi_exit);
168
169MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
170MODULE_DESCRIPTION("digital potentiometer SPI bus driver");
171MODULE_LICENSE("GPL");
172MODULE_ALIAS("spi:ad_dpot");
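For the SPI parts, every transfer starts with an instruction byte, optionally followed by one or two data bytes, which is why write16() and write24() must hand spi_write() the full buffer length (corrected to sizeof(data) above). A hedged sketch of the 24-bit packing with made-up register and wiper values:

#include <stdio.h>

int main(void)
{
	unsigned char reg = 0x0b;	/* made-up instruction byte */
	unsigned short val = 0x0234;	/* made-up 10-bit wiper value */

	/* same packing as write24() above: reg, then value MSB-first */
	unsigned char data[3] = { reg, val >> 8, val & 0xff };

	printf("%02x %02x %02x\n", data[0], data[1], data[2]);
	/* prints: 0b 02 34 -- all three bytes must go in one spi_write() */
	return 0;
}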
diff --git a/drivers/misc/ad525x_dpot.c b/drivers/misc/ad525x_dpot.c
index 30a59f2bacd2..5e6fa8449e8b 100644
--- a/drivers/misc/ad525x_dpot.c
+++ b/drivers/misc/ad525x_dpot.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * ad525x_dpot: Driver for the Analog Devices AD525x digital potentiometers 2 * ad525x_dpot: Driver for the Analog Devices digital potentiometers
3 * Copyright (c) 2009 Analog Devices, Inc. 3 * Copyright (c) 2009-2010 Analog Devices, Inc.
4 * Author: Michael Hennerich <hennerich@blackfin.uclinux.org> 4 * Author: Michael Hennerich <hennerich@blackfin.uclinux.org>
5 * 5 *
6 * DEVID #Wipers #Positions Resistor Options (kOhm) 6 * DEVID #Wipers #Positions Resistor Options (kOhm)
@@ -11,6 +11,47 @@
11 * AD5255 3 512 25, 250 11 * AD5255 3 512 25, 250
12 * AD5253 4 64 1, 10, 50, 100 12 * AD5253 4 64 1, 10, 50, 100
13 * AD5254 4 256 1, 10, 50, 100 13 * AD5254 4 256 1, 10, 50, 100
14 * AD5160 1 256 5, 10, 50, 100
15 * AD5161 1 256 5, 10, 50, 100
16 * AD5162 2 256 2.5, 10, 50, 100
17 * AD5165 1 256 100
18 * AD5200 1 256 10, 50
19 * AD5201 1 33 10, 50
20 * AD5203 4 64 10, 100
21 * AD5204 4 256 10, 50, 100
22 * AD5206 6 256 10, 50, 100
23 * AD5207 2 256 10, 50, 100
24 * AD5231 1 1024 10, 50, 100
25 * AD5232 2 256 10, 50, 100
26 * AD5233 4 64 10, 50, 100
27 * AD5235 2 1024 25, 250
28 * AD5260 1 256 20, 50, 200
29 * AD5262 2 256 20, 50, 200
30 * AD5263 4 256 20, 50, 200
31 * AD5290 1 256 10, 50, 100
32 * AD5291 1 256 20
33 * AD5292 1 1024 20
34 * AD5293 1 1024 20
35 * AD7376 1 128 10, 50, 100, 1M
36 * AD8400 1 256 1, 10, 50, 100
37 * AD8402 2 256 1, 10, 50, 100
38 * AD8403 4 256 1, 10, 50, 100
39 * ADN2850 3 512 25, 250
40 * AD5241 1 256 10, 100, 1M
41 * AD5246 1 128 5, 10, 50, 100
42 * AD5247 1 128 5, 10, 50, 100
43 * AD5245 1 256 5, 10, 50, 100
44 * AD5243 2 256 2.5, 10, 50, 100
45 * AD5248 2 256 2.5, 10, 50, 100
46 * AD5242 2 256 20, 50, 200
47 * AD5280 1 256 20, 50, 200
48 * AD5282 2 256 20, 50, 200
49 * ADN2860 3 512 25, 250
50 * AD5273 1 64 1, 10, 50, 100 (OTP)
51 * AD5171 1 64 5, 10, 50, 100 (OTP)
52 * AD5170 1 256 2.5, 10, 50, 100 (OTP)
53 * AD5172 2 256 2.5, 10, 50, 100 (OTP)
54 * AD5173 2 256 2.5, 10, 50, 100 (OTP)
14 * 55 *
15 * See Documentation/misc-devices/ad525x_dpot.txt for more info. 56 * See Documentation/misc-devices/ad525x_dpot.txt for more info.
16 * 57 *
@@ -28,77 +69,283 @@
28#include <linux/device.h> 69#include <linux/device.h>
29#include <linux/kernel.h> 70#include <linux/kernel.h>
30#include <linux/init.h> 71#include <linux/init.h>
31#include <linux/slab.h>
32#include <linux/i2c.h>
33#include <linux/delay.h> 72#include <linux/delay.h>
73#include <linux/slab.h>
34 74
35#define DRIVER_NAME "ad525x_dpot" 75#define DRIVER_VERSION "0.2"
36#define DRIVER_VERSION "0.1"
37
38enum dpot_devid {
39 AD5258_ID,
40 AD5259_ID,
41 AD5251_ID,
42 AD5252_ID,
43 AD5253_ID,
44 AD5254_ID,
45 AD5255_ID,
46};
47 76
48#define AD5258_MAX_POSITION 64 77#include "ad525x_dpot.h"
49#define AD5259_MAX_POSITION 256
50#define AD5251_MAX_POSITION 64
51#define AD5252_MAX_POSITION 256
52#define AD5253_MAX_POSITION 64
53#define AD5254_MAX_POSITION 256
54#define AD5255_MAX_POSITION 512
55
56#define AD525X_RDAC0 0
57#define AD525X_RDAC1 1
58#define AD525X_RDAC2 2
59#define AD525X_RDAC3 3
60
61#define AD525X_REG_TOL 0x18
62#define AD525X_TOL_RDAC0 (AD525X_REG_TOL | AD525X_RDAC0)
63#define AD525X_TOL_RDAC1 (AD525X_REG_TOL | AD525X_RDAC1)
64#define AD525X_TOL_RDAC2 (AD525X_REG_TOL | AD525X_RDAC2)
65#define AD525X_TOL_RDAC3 (AD525X_REG_TOL | AD525X_RDAC3)
66
67/* RDAC-to-EEPROM Interface Commands */
68#define AD525X_I2C_RDAC (0x00 << 5)
69#define AD525X_I2C_EEPROM (0x01 << 5)
70#define AD525X_I2C_CMD (0x80)
71
72#define AD525X_DEC_ALL_6DB (AD525X_I2C_CMD | (0x4 << 3))
73#define AD525X_INC_ALL_6DB (AD525X_I2C_CMD | (0x9 << 3))
74#define AD525X_DEC_ALL (AD525X_I2C_CMD | (0x6 << 3))
75#define AD525X_INC_ALL (AD525X_I2C_CMD | (0xB << 3))
76
77static s32 ad525x_read(struct i2c_client *client, u8 reg);
78static s32 ad525x_write(struct i2c_client *client, u8 reg, u8 value);
79 78
80/* 79/*
81 * Client data (each client gets its own) 80 * Client data (each client gets its own)
82 */ 81 */
83 82
84struct dpot_data { 83struct dpot_data {
84 struct ad_dpot_bus_data bdata;
85 struct mutex update_lock; 85 struct mutex update_lock;
86 unsigned rdac_mask; 86 unsigned rdac_mask;
87 unsigned max_pos; 87 unsigned max_pos;
88 unsigned devid; 88 unsigned long devid;
89 unsigned uid;
90 unsigned feat;
91 unsigned wipers;
92 u16 rdac_cache[MAX_RDACS];
93 DECLARE_BITMAP(otp_en_mask, MAX_RDACS);
89}; 94};
90 95
96static inline int dpot_read_d8(struct dpot_data *dpot)
97{
98 return dpot->bdata.bops->read_d8(dpot->bdata.client);
99}
100
101static inline int dpot_read_r8d8(struct dpot_data *dpot, u8 reg)
102{
103 return dpot->bdata.bops->read_r8d8(dpot->bdata.client, reg);
104}
105
106static inline int dpot_read_r8d16(struct dpot_data *dpot, u8 reg)
107{
108 return dpot->bdata.bops->read_r8d16(dpot->bdata.client, reg);
109}
110
111static inline int dpot_write_d8(struct dpot_data *dpot, u8 val)
112{
113 return dpot->bdata.bops->write_d8(dpot->bdata.client, val);
114}
115
116static inline int dpot_write_r8d8(struct dpot_data *dpot, u8 reg, u16 val)
117{
118 return dpot->bdata.bops->write_r8d8(dpot->bdata.client, reg, val);
119}
120
121static inline int dpot_write_r8d16(struct dpot_data *dpot, u8 reg, u16 val)
122{
123 return dpot->bdata.bops->write_r8d16(dpot->bdata.client, reg, val);
124}
125
126static s32 dpot_read_spi(struct dpot_data *dpot, u8 reg)
127{
128 unsigned ctrl = 0;
129
130 if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD))) {
131
132 if (dpot->feat & F_RDACS_WONLY)
133 return dpot->rdac_cache[reg & DPOT_RDAC_MASK];
134
135 if (dpot->uid == DPOT_UID(AD5291_ID) ||
136 dpot->uid == DPOT_UID(AD5292_ID) ||
137 dpot->uid == DPOT_UID(AD5293_ID))
138 return dpot_read_r8d8(dpot,
139 DPOT_AD5291_READ_RDAC << 2);
140
141 ctrl = DPOT_SPI_READ_RDAC;
142 } else if (reg & DPOT_ADDR_EEPROM) {
143 ctrl = DPOT_SPI_READ_EEPROM;
144 }
145
146 if (dpot->feat & F_SPI_16BIT)
147 return dpot_read_r8d8(dpot, ctrl);
148 else if (dpot->feat & F_SPI_24BIT)
149 return dpot_read_r8d16(dpot, ctrl);
150
151 return -EFAULT;
152}
153
154static s32 dpot_read_i2c(struct dpot_data *dpot, u8 reg)
155{
156 unsigned ctrl = 0;
157 switch (dpot->uid) {
158 case DPOT_UID(AD5246_ID):
159 case DPOT_UID(AD5247_ID):
160 return dpot_read_d8(dpot);
161 case DPOT_UID(AD5245_ID):
162 case DPOT_UID(AD5241_ID):
163 case DPOT_UID(AD5242_ID):
164 case DPOT_UID(AD5243_ID):
165 case DPOT_UID(AD5248_ID):
166 case DPOT_UID(AD5280_ID):
167 case DPOT_UID(AD5282_ID):
168 ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ?
169 0 : DPOT_AD5291_RDAC_AB;
170 return dpot_read_r8d8(dpot, ctrl);
171 case DPOT_UID(AD5170_ID):
172 case DPOT_UID(AD5171_ID):
173 case DPOT_UID(AD5273_ID):
174 return dpot_read_d8(dpot);
175 case DPOT_UID(AD5172_ID):
176 case DPOT_UID(AD5173_ID):
177 ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ?
178 0 : DPOT_AD5272_3_A0;
179 return dpot_read_r8d8(dpot, ctrl);
180 default:
181 if ((reg & DPOT_REG_TOL) || (dpot->max_pos > 256))
182 return dpot_read_r8d16(dpot, (reg & 0xF8) |
183 ((reg & 0x7) << 1));
184 else
185 return dpot_read_r8d8(dpot, reg);
186 }
187}
188
189static s32 dpot_read(struct dpot_data *dpot, u8 reg)
190{
191 if (dpot->feat & F_SPI)
192 return dpot_read_spi(dpot, reg);
193 else
194 return dpot_read_i2c(dpot, reg);
195}
196
197static s32 dpot_write_spi(struct dpot_data *dpot, u8 reg, u16 value)
198{
199 unsigned val = 0;
200
201 if (!(reg & (DPOT_ADDR_EEPROM | DPOT_ADDR_CMD))) {
202 if (dpot->feat & F_RDACS_WONLY)
203 dpot->rdac_cache[reg & DPOT_RDAC_MASK] = value;
204
205 if (dpot->feat & F_AD_APPDATA) {
206 if (dpot->feat & F_SPI_8BIT) {
207 val = ((reg & DPOT_RDAC_MASK) <<
208 DPOT_MAX_POS(dpot->devid)) |
209 value;
210 return dpot_write_d8(dpot, val);
211 } else if (dpot->feat & F_SPI_16BIT) {
212 val = ((reg & DPOT_RDAC_MASK) <<
213 DPOT_MAX_POS(dpot->devid)) |
214 value;
215 return dpot_write_r8d8(dpot, val >> 8,
216 val & 0xFF);
217 } else
218 BUG();
219 } else {
220 if (dpot->uid == DPOT_UID(AD5291_ID) ||
221 dpot->uid == DPOT_UID(AD5292_ID) ||
222 dpot->uid == DPOT_UID(AD5293_ID))
223 return dpot_write_r8d8(dpot,
224 (DPOT_AD5291_RDAC << 2) |
225 (value >> 8), value & 0xFF);
226
227 val = DPOT_SPI_RDAC | (reg & DPOT_RDAC_MASK);
228 }
229 } else if (reg & DPOT_ADDR_EEPROM) {
230 val = DPOT_SPI_EEPROM | (reg & DPOT_RDAC_MASK);
231 } else if (reg & DPOT_ADDR_CMD) {
232 switch (reg) {
233 case DPOT_DEC_ALL_6DB:
234 val = DPOT_SPI_DEC_ALL_6DB;
235 break;
236 case DPOT_INC_ALL_6DB:
237 val = DPOT_SPI_INC_ALL_6DB;
238 break;
239 case DPOT_DEC_ALL:
240 val = DPOT_SPI_DEC_ALL;
241 break;
242 case DPOT_INC_ALL:
243 val = DPOT_SPI_INC_ALL;
244 break;
245 }
246 } else
247 BUG();
248
249 if (dpot->feat & F_SPI_16BIT)
250 return dpot_write_r8d8(dpot, val, value);
251 else if (dpot->feat & F_SPI_24BIT)
252 return dpot_write_r8d16(dpot, val, value);
253
254 return -EFAULT;
255}
256
257static s32 dpot_write_i2c(struct dpot_data *dpot, u8 reg, u16 value)
258{
259 /* Only write the instruction byte for certain commands */
260 unsigned tmp = 0, ctrl = 0;
261
262 switch (dpot->uid) {
263 case DPOT_UID(AD5246_ID):
264 case DPOT_UID(AD5247_ID):
265 return dpot_write_d8(dpot, value);
267
268 case DPOT_UID(AD5245_ID):
269 case DPOT_UID(AD5241_ID):
270 case DPOT_UID(AD5242_ID):
271 case DPOT_UID(AD5243_ID):
272 case DPOT_UID(AD5248_ID):
273 case DPOT_UID(AD5280_ID):
274 case DPOT_UID(AD5282_ID):
275 ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ?
276 0 : DPOT_AD5291_RDAC_AB;
277 return dpot_write_r8d8(dpot, ctrl, value);
279 case DPOT_UID(AD5171_ID):
280 case DPOT_UID(AD5273_ID):
281 if (reg & DPOT_ADDR_OTP) {
282 tmp = dpot_read_d8(dpot);
283 if (tmp >> 6) /* Ready to Program? */
284 return -EFAULT;
285 ctrl = DPOT_AD5273_FUSE;
286 }
287 return dpot_write_r8d8(dpot, ctrl, value);
289 case DPOT_UID(AD5172_ID):
290 case DPOT_UID(AD5173_ID):
291 ctrl = ((reg & DPOT_RDAC_MASK) == DPOT_RDAC0) ?
292 0 : DPOT_AD5272_3_A0;
293 if (reg & DPOT_ADDR_OTP) {
294 tmp = dpot_read_r8d16(dpot, ctrl);
295 if (tmp >> 14) /* Ready to Program? */
296 return -EFAULT;
297 ctrl |= DPOT_AD5270_2_3_FUSE;
298 }
299 return dpot_write_r8d8(dpot, ctrl, value);
301 case DPOT_UID(AD5170_ID):
302 if (reg & DPOT_ADDR_OTP) {
303 tmp = dpot_read_r8d16(dpot, tmp);
304 if (tmp >> 14) /* Ready to Program? */
305 return -EFAULT;
306 ctrl = DPOT_AD5270_2_3_FUSE;
307 }
308 return dpot_write_r8d8(dpot, ctrl, value);
310 default:
311 if (reg & DPOT_ADDR_CMD)
312 return dpot_write_d8(dpot, reg);
313
314 if (dpot->max_pos > 256)
315 return dpot_write_r8d16(dpot, (reg & 0xF8) |
316 ((reg & 0x7) << 1), value);
317 else
318 /* All other registers require instruction + data bytes */
319 return dpot_write_r8d8(dpot, reg, value);
320 }
321}
322
323
324static s32 dpot_write(struct dpot_data *dpot, u8 reg, u16 value)
325{
326 if (dpot->feat & F_SPI)
327 return dpot_write_spi(dpot, reg, value);
328 else
329 return dpot_write_i2c(dpot, reg, value);
330}
331
91/* sysfs functions */ 332/* sysfs functions */
92 333
93static ssize_t sysfs_show_reg(struct device *dev, 334static ssize_t sysfs_show_reg(struct device *dev,
94 struct device_attribute *attr, char *buf, u32 reg) 335 struct device_attribute *attr,
336 char *buf, u32 reg)
95{ 337{
96 struct i2c_client *client = to_i2c_client(dev); 338 struct dpot_data *data = dev_get_drvdata(dev);
97 struct dpot_data *data = i2c_get_clientdata(client);
98 s32 value; 339 s32 value;
99 340
341 if (reg & DPOT_ADDR_OTP_EN)
342 return sprintf(buf, "%s\n",
343 test_bit(DPOT_RDAC_MASK & reg, data->otp_en_mask) ?
344 "enabled" : "disabled");
345
346
100 mutex_lock(&data->update_lock); 347 mutex_lock(&data->update_lock);
101 value = ad525x_read(client, reg); 348 value = dpot_read(data, reg);
102 mutex_unlock(&data->update_lock); 349 mutex_unlock(&data->update_lock);
103 350
104 if (value < 0) 351 if (value < 0)
@@ -111,7 +358,7 @@ static ssize_t sysfs_show_reg(struct device *dev,
111 * datasheet (Rev. A) for more details. 358 * datasheet (Rev. A) for more details.
112 */ 359 */
113 360
114 if (reg & AD525X_REG_TOL) 361 if (reg & DPOT_REG_TOL)
115 return sprintf(buf, "0x%04x\n", value & 0xFFFF); 362 return sprintf(buf, "0x%04x\n", value & 0xFFFF);
116 else 363 else
117 return sprintf(buf, "%u\n", value & data->rdac_mask); 364 return sprintf(buf, "%u\n", value & data->rdac_mask);
@@ -121,11 +368,23 @@ static ssize_t sysfs_set_reg(struct device *dev,
121 struct device_attribute *attr, 368 struct device_attribute *attr,
122 const char *buf, size_t count, u32 reg) 369 const char *buf, size_t count, u32 reg)
123{ 370{
124 struct i2c_client *client = to_i2c_client(dev); 371 struct dpot_data *data = dev_get_drvdata(dev);
125 struct dpot_data *data = i2c_get_clientdata(client);
126 unsigned long value; 372 unsigned long value;
127 int err; 373 int err;
128 374
375 if (reg & DPOT_ADDR_OTP_EN) {
 376		if (!strncmp(buf, "enabled", sizeof("enabled") - 1))
377 set_bit(DPOT_RDAC_MASK & reg, data->otp_en_mask);
378 else
379 clear_bit(DPOT_RDAC_MASK & reg, data->otp_en_mask);
380
381 return count;
382 }
383
384 if ((reg & DPOT_ADDR_OTP) &&
385 !test_bit(DPOT_RDAC_MASK & reg, data->otp_en_mask))
386 return -EPERM;
387
129 err = strict_strtoul(buf, 10, &value); 388 err = strict_strtoul(buf, 10, &value);
130 if (err) 389 if (err)
131 return err; 390 return err;
@@ -134,9 +393,11 @@ static ssize_t sysfs_set_reg(struct device *dev,
134 value = data->rdac_mask; 393 value = data->rdac_mask;
135 394
136 mutex_lock(&data->update_lock); 395 mutex_lock(&data->update_lock);
137 ad525x_write(client, reg, value); 396 dpot_write(data, reg, value);
138 if (reg & AD525X_I2C_EEPROM) 397 if (reg & DPOT_ADDR_EEPROM)
139 msleep(26); /* Sleep while the EEPROM updates */ 398 msleep(26); /* Sleep while the EEPROM updates */
399 else if (reg & DPOT_ADDR_OTP)
400 msleep(400); /* Sleep while the OTP updates */
140 mutex_unlock(&data->update_lock); 401 mutex_unlock(&data->update_lock);
141 402
142 return count; 403 return count;
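One-time-programmable parts can be fuse-burned exactly once, so sysfs_set_reg() above refuses DPOT_ADDR_OTP writes unless the matching otp_en_mask bit was set first via the otpXen attribute. A hedged userspace sketch of that gate, using a plain unsigned long in place of the driver's DECLARE_BITMAP:

#include <stdio.h>

static unsigned long otp_en_mask;	/* stand-in for the driver's bitmap */

static int otp_write(int rdac, int value)
{
	/* refuse the one-time write unless the user enabled it first */
	if (!(otp_en_mask & (1UL << rdac)))
		return -1;		/* the driver returns -EPERM here */
	printf("burning %d into OTP rdac%d\n", value, rdac);
	return 0;
}

int main(void)
{
	if (otp_write(0, 31) < 0)
		printf("rdac0: OTP write refused\n");

	otp_en_mask |= 1UL << 0;	/* echo enabled > otp0en */
	otp_write(0, 31);
	return 0;
}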
@@ -146,11 +407,10 @@ static ssize_t sysfs_do_cmd(struct device *dev,
146 struct device_attribute *attr, 407 struct device_attribute *attr,
147 const char *buf, size_t count, u32 reg) 408 const char *buf, size_t count, u32 reg)
148{ 409{
149 struct i2c_client *client = to_i2c_client(dev); 410 struct dpot_data *data = dev_get_drvdata(dev);
150 struct dpot_data *data = i2c_get_clientdata(client);
151 411
152 mutex_lock(&data->update_lock); 412 mutex_lock(&data->update_lock);
153 ad525x_write(client, reg, 0); 413 dpot_write(data, reg, 0);
154 mutex_unlock(&data->update_lock); 414 mutex_unlock(&data->update_lock);
155 415
156 return count; 416 return count;
@@ -158,244 +418,131 @@ static ssize_t sysfs_do_cmd(struct device *dev,
158 418
159/* ------------------------------------------------------------------------- */ 419/* ------------------------------------------------------------------------- */
160 420
161static ssize_t show_rdac0(struct device *dev, 421#define DPOT_DEVICE_SHOW(_name, _reg) static ssize_t \
162 struct device_attribute *attr, char *buf) 422show_##_name(struct device *dev, \
163{ 423 struct device_attribute *attr, char *buf) \
164 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_RDAC | AD525X_RDAC0); 424{ \
165} 425 return sysfs_show_reg(dev, attr, buf, _reg); \
166 426}
167static ssize_t set_rdac0(struct device *dev, 427
168 struct device_attribute *attr, 428#define DPOT_DEVICE_SET(_name, _reg) static ssize_t \
169 const char *buf, size_t count) 429set_##_name(struct device *dev, \
170{ 430 struct device_attribute *attr, \
171 return sysfs_set_reg(dev, attr, buf, count, 431 const char *buf, size_t count) \
172 AD525X_I2C_RDAC | AD525X_RDAC0); 432{ \
173} 433 return sysfs_set_reg(dev, attr, buf, count, _reg); \
174 434}
175static DEVICE_ATTR(rdac0, S_IWUSR | S_IRUGO, show_rdac0, set_rdac0); 435
176 436#define DPOT_DEVICE_SHOW_SET(name, reg) \
177static ssize_t show_eeprom0(struct device *dev, 437DPOT_DEVICE_SHOW(name, reg) \
178 struct device_attribute *attr, char *buf) 438DPOT_DEVICE_SET(name, reg) \
179{ 439static DEVICE_ATTR(name, S_IWUSR | S_IRUGO, show_##name, set_##name);
180 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_EEPROM | AD525X_RDAC0); 440
181} 441#define DPOT_DEVICE_SHOW_ONLY(name, reg) \
182 442DPOT_DEVICE_SHOW(name, reg) \
 183static ssize_t set_eeprom0(struct device *dev, 443static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL);
184 struct device_attribute *attr, 444
185 const char *buf, size_t count) 445DPOT_DEVICE_SHOW_SET(rdac0, DPOT_ADDR_RDAC | DPOT_RDAC0);
186{ 446DPOT_DEVICE_SHOW_SET(eeprom0, DPOT_ADDR_EEPROM | DPOT_RDAC0);
187 return sysfs_set_reg(dev, attr, buf, count, 447DPOT_DEVICE_SHOW_ONLY(tolerance0, DPOT_ADDR_EEPROM | DPOT_TOL_RDAC0);
188 AD525X_I2C_EEPROM | AD525X_RDAC0); 448DPOT_DEVICE_SHOW_SET(otp0, DPOT_ADDR_OTP | DPOT_RDAC0);
189} 449DPOT_DEVICE_SHOW_SET(otp0en, DPOT_ADDR_OTP_EN | DPOT_RDAC0);
190 450
191static DEVICE_ATTR(eeprom0, S_IWUSR | S_IRUGO, show_eeprom0, set_eeprom0); 451DPOT_DEVICE_SHOW_SET(rdac1, DPOT_ADDR_RDAC | DPOT_RDAC1);
192 452DPOT_DEVICE_SHOW_SET(eeprom1, DPOT_ADDR_EEPROM | DPOT_RDAC1);
193static ssize_t show_tolerance0(struct device *dev, 453DPOT_DEVICE_SHOW_ONLY(tolerance1, DPOT_ADDR_EEPROM | DPOT_TOL_RDAC1);
194 struct device_attribute *attr, char *buf) 454DPOT_DEVICE_SHOW_SET(otp1, DPOT_ADDR_OTP | DPOT_RDAC1);
195{ 455DPOT_DEVICE_SHOW_SET(otp1en, DPOT_ADDR_OTP_EN | DPOT_RDAC1);
196 return sysfs_show_reg(dev, attr, buf, 456
197 AD525X_I2C_EEPROM | AD525X_TOL_RDAC0); 457DPOT_DEVICE_SHOW_SET(rdac2, DPOT_ADDR_RDAC | DPOT_RDAC2);
198} 458DPOT_DEVICE_SHOW_SET(eeprom2, DPOT_ADDR_EEPROM | DPOT_RDAC2);
199 459DPOT_DEVICE_SHOW_ONLY(tolerance2, DPOT_ADDR_EEPROM | DPOT_TOL_RDAC2);
200static DEVICE_ATTR(tolerance0, S_IRUGO, show_tolerance0, NULL); 460DPOT_DEVICE_SHOW_SET(otp2, DPOT_ADDR_OTP | DPOT_RDAC2);
201 461DPOT_DEVICE_SHOW_SET(otp2en, DPOT_ADDR_OTP_EN | DPOT_RDAC2);
202/* ------------------------------------------------------------------------- */ 462
203 463DPOT_DEVICE_SHOW_SET(rdac3, DPOT_ADDR_RDAC | DPOT_RDAC3);
204static ssize_t show_rdac1(struct device *dev, 464DPOT_DEVICE_SHOW_SET(eeprom3, DPOT_ADDR_EEPROM | DPOT_RDAC3);
205 struct device_attribute *attr, char *buf) 465DPOT_DEVICE_SHOW_ONLY(tolerance3, DPOT_ADDR_EEPROM | DPOT_TOL_RDAC3);
206{ 466DPOT_DEVICE_SHOW_SET(otp3, DPOT_ADDR_OTP | DPOT_RDAC3);
207 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_RDAC | AD525X_RDAC1); 467DPOT_DEVICE_SHOW_SET(otp3en, DPOT_ADDR_OTP_EN | DPOT_RDAC3);
208} 468
209 469DPOT_DEVICE_SHOW_SET(rdac4, DPOT_ADDR_RDAC | DPOT_RDAC4);
210static ssize_t set_rdac1(struct device *dev, 470DPOT_DEVICE_SHOW_SET(eeprom4, DPOT_ADDR_EEPROM | DPOT_RDAC4);
211 struct device_attribute *attr, 471DPOT_DEVICE_SHOW_ONLY(tolerance4, DPOT_ADDR_EEPROM | DPOT_TOL_RDAC4);
212 const char *buf, size_t count) 472DPOT_DEVICE_SHOW_SET(otp4, DPOT_ADDR_OTP | DPOT_RDAC4);
213{ 473DPOT_DEVICE_SHOW_SET(otp4en, DPOT_ADDR_OTP_EN | DPOT_RDAC4);
214 return sysfs_set_reg(dev, attr, buf, count, 474
215 AD525X_I2C_RDAC | AD525X_RDAC1); 475DPOT_DEVICE_SHOW_SET(rdac5, DPOT_ADDR_RDAC | DPOT_RDAC5);
216} 476DPOT_DEVICE_SHOW_SET(eeprom5, DPOT_ADDR_EEPROM | DPOT_RDAC5);
217 477DPOT_DEVICE_SHOW_ONLY(tolerance5, DPOT_ADDR_EEPROM | DPOT_TOL_RDAC5);
218static DEVICE_ATTR(rdac1, S_IWUSR | S_IRUGO, show_rdac1, set_rdac1); 478DPOT_DEVICE_SHOW_SET(otp5, DPOT_ADDR_OTP | DPOT_RDAC5);
219 479DPOT_DEVICE_SHOW_SET(otp5en, DPOT_ADDR_OTP_EN | DPOT_RDAC5);
220static ssize_t show_eeprom1(struct device *dev, 480
221 struct device_attribute *attr, char *buf) 481static const struct attribute *dpot_attrib_wipers[] = {
222{ 482 &dev_attr_rdac0.attr,
223 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_EEPROM | AD525X_RDAC1); 483 &dev_attr_rdac1.attr,
224} 484 &dev_attr_rdac2.attr,
225 485 &dev_attr_rdac3.attr,
226static ssize_t set_eeprom1(struct device *dev, 486 &dev_attr_rdac4.attr,
227 struct device_attribute *attr, 487 &dev_attr_rdac5.attr,
228 const char *buf, size_t count) 488 NULL
229{ 489};
230 return sysfs_set_reg(dev, attr, buf, count,
231 AD525X_I2C_EEPROM | AD525X_RDAC1);
232}
233
234static DEVICE_ATTR(eeprom1, S_IWUSR | S_IRUGO, show_eeprom1, set_eeprom1);
235
236static ssize_t show_tolerance1(struct device *dev,
237 struct device_attribute *attr, char *buf)
238{
239 return sysfs_show_reg(dev, attr, buf,
240 AD525X_I2C_EEPROM | AD525X_TOL_RDAC1);
241}
242
243static DEVICE_ATTR(tolerance1, S_IRUGO, show_tolerance1, NULL);
244
245/* ------------------------------------------------------------------------- */
246
247static ssize_t show_rdac2(struct device *dev,
248 struct device_attribute *attr, char *buf)
249{
250 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_RDAC | AD525X_RDAC2);
251}
252
253static ssize_t set_rdac2(struct device *dev,
254 struct device_attribute *attr,
255 const char *buf, size_t count)
256{
257 return sysfs_set_reg(dev, attr, buf, count,
258 AD525X_I2C_RDAC | AD525X_RDAC2);
259}
260
261static DEVICE_ATTR(rdac2, S_IWUSR | S_IRUGO, show_rdac2, set_rdac2);
262
263static ssize_t show_eeprom2(struct device *dev,
264 struct device_attribute *attr, char *buf)
265{
266 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_EEPROM | AD525X_RDAC2);
267}
268
269static ssize_t set_eeprom2(struct device *dev,
270 struct device_attribute *attr,
271 const char *buf, size_t count)
272{
273 return sysfs_set_reg(dev, attr, buf, count,
274 AD525X_I2C_EEPROM | AD525X_RDAC2);
275}
276
277static DEVICE_ATTR(eeprom2, S_IWUSR | S_IRUGO, show_eeprom2, set_eeprom2);
278
279static ssize_t show_tolerance2(struct device *dev,
280 struct device_attribute *attr, char *buf)
281{
282 return sysfs_show_reg(dev, attr, buf,
283 AD525X_I2C_EEPROM | AD525X_TOL_RDAC2);
284}
285
286static DEVICE_ATTR(tolerance2, S_IRUGO, show_tolerance2, NULL);
287
288/* ------------------------------------------------------------------------- */
289
290static ssize_t show_rdac3(struct device *dev,
291 struct device_attribute *attr, char *buf)
292{
293 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_RDAC | AD525X_RDAC3);
294}
295
296static ssize_t set_rdac3(struct device *dev,
297 struct device_attribute *attr,
298 const char *buf, size_t count)
299{
300 return sysfs_set_reg(dev, attr, buf, count,
301 AD525X_I2C_RDAC | AD525X_RDAC3);
302}
303
304static DEVICE_ATTR(rdac3, S_IWUSR | S_IRUGO, show_rdac3, set_rdac3);
305
306static ssize_t show_eeprom3(struct device *dev,
307 struct device_attribute *attr, char *buf)
308{
309 return sysfs_show_reg(dev, attr, buf, AD525X_I2C_EEPROM | AD525X_RDAC3);
310}
311
312static ssize_t set_eeprom3(struct device *dev,
313 struct device_attribute *attr,
314 const char *buf, size_t count)
315{
316 return sysfs_set_reg(dev, attr, buf, count,
317 AD525X_I2C_EEPROM | AD525X_RDAC3);
318}
319 490
320static DEVICE_ATTR(eeprom3, S_IWUSR | S_IRUGO, show_eeprom3, set_eeprom3); 491static const struct attribute *dpot_attrib_eeprom[] = {
492 &dev_attr_eeprom0.attr,
493 &dev_attr_eeprom1.attr,
494 &dev_attr_eeprom2.attr,
495 &dev_attr_eeprom3.attr,
496 &dev_attr_eeprom4.attr,
497 &dev_attr_eeprom5.attr,
498 NULL
499};
321 500
322static ssize_t show_tolerance3(struct device *dev, 501static const struct attribute *dpot_attrib_otp[] = {
323 struct device_attribute *attr, char *buf) 502 &dev_attr_otp0.attr,
324{ 503 &dev_attr_otp1.attr,
325 return sysfs_show_reg(dev, attr, buf, 504 &dev_attr_otp2.attr,
326 AD525X_I2C_EEPROM | AD525X_TOL_RDAC3); 505 &dev_attr_otp3.attr,
327} 506 &dev_attr_otp4.attr,
507 &dev_attr_otp5.attr,
508 NULL
509};
328 510
329static DEVICE_ATTR(tolerance3, S_IRUGO, show_tolerance3, NULL); 511static const struct attribute *dpot_attrib_otp_en[] = {
330 512 &dev_attr_otp0en.attr,
331static struct attribute *ad525x_attributes_wipers[4][4] = { 513 &dev_attr_otp1en.attr,
332 { 514 &dev_attr_otp2en.attr,
333 &dev_attr_rdac0.attr, 515 &dev_attr_otp3en.attr,
334 &dev_attr_eeprom0.attr, 516 &dev_attr_otp4en.attr,
335 &dev_attr_tolerance0.attr, 517 &dev_attr_otp5en.attr,
336 NULL 518 NULL
337 }, {
338 &dev_attr_rdac1.attr,
339 &dev_attr_eeprom1.attr,
340 &dev_attr_tolerance1.attr,
341 NULL
342 }, {
343 &dev_attr_rdac2.attr,
344 &dev_attr_eeprom2.attr,
345 &dev_attr_tolerance2.attr,
346 NULL
347 }, {
348 &dev_attr_rdac3.attr,
349 &dev_attr_eeprom3.attr,
350 &dev_attr_tolerance3.attr,
351 NULL
352 }
353}; 519};
354 520
355static const struct attribute_group ad525x_group_wipers[] = { 521static const struct attribute *dpot_attrib_tolerance[] = {
356 {.attrs = ad525x_attributes_wipers[AD525X_RDAC0]}, 522 &dev_attr_tolerance0.attr,
357 {.attrs = ad525x_attributes_wipers[AD525X_RDAC1]}, 523 &dev_attr_tolerance1.attr,
358 {.attrs = ad525x_attributes_wipers[AD525X_RDAC2]}, 524 &dev_attr_tolerance2.attr,
359 {.attrs = ad525x_attributes_wipers[AD525X_RDAC3]}, 525 &dev_attr_tolerance3.attr,
526 &dev_attr_tolerance4.attr,
527 &dev_attr_tolerance5.attr,
528 NULL
360}; 529};
361 530
362/* ------------------------------------------------------------------------- */ 531/* ------------------------------------------------------------------------- */
363 532
364static ssize_t set_inc_all(struct device *dev, 533#define DPOT_DEVICE_DO_CMD(_name, _cmd) static ssize_t \
365 struct device_attribute *attr, 534set_##_name(struct device *dev, \
366 const char *buf, size_t count) 535 struct device_attribute *attr, \
367{ 536 const char *buf, size_t count) \
368 return sysfs_do_cmd(dev, attr, buf, count, AD525X_INC_ALL); 537{ \
369} 538 return sysfs_do_cmd(dev, attr, buf, count, _cmd); \
539} \
 540static DEVICE_ATTR(_name, S_IWUSR, NULL, set_##_name);
370 541
371static DEVICE_ATTR(inc_all, S_IWUSR, NULL, set_inc_all); 542DPOT_DEVICE_DO_CMD(inc_all, DPOT_INC_ALL);
372 543DPOT_DEVICE_DO_CMD(dec_all, DPOT_DEC_ALL);
373static ssize_t set_dec_all(struct device *dev, 544DPOT_DEVICE_DO_CMD(inc_all_6db, DPOT_INC_ALL_6DB);
374 struct device_attribute *attr, 545DPOT_DEVICE_DO_CMD(dec_all_6db, DPOT_DEC_ALL_6DB);
375 const char *buf, size_t count)
376{
377 return sysfs_do_cmd(dev, attr, buf, count, AD525X_DEC_ALL);
378}
379
380static DEVICE_ATTR(dec_all, S_IWUSR, NULL, set_dec_all);
381
382static ssize_t set_inc_all_6db(struct device *dev,
383 struct device_attribute *attr,
384 const char *buf, size_t count)
385{
386 return sysfs_do_cmd(dev, attr, buf, count, AD525X_INC_ALL_6DB);
387}
388
389static DEVICE_ATTR(inc_all_6db, S_IWUSR, NULL, set_inc_all_6db);
390
391static ssize_t set_dec_all_6db(struct device *dev,
392 struct device_attribute *attr,
393 const char *buf, size_t count)
394{
395 return sysfs_do_cmd(dev, attr, buf, count, AD525X_DEC_ALL_6DB);
396}
397
398static DEVICE_ATTR(dec_all_6db, S_IWUSR, NULL, set_dec_all_6db);
399 546
400static struct attribute *ad525x_attributes_commands[] = { 547static struct attribute *ad525x_attributes_commands[] = {
401 &dev_attr_inc_all.attr, 548 &dev_attr_inc_all.attr,
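
The macro trio above replaces the dozens of hand-written accessors deleted on the left-hand side. For reference, DPOT_DEVICE_SHOW_SET(rdac0, DPOT_ADDR_RDAC | DPOT_RDAC0) expands to essentially the code it removes:

static ssize_t show_rdac0(struct device *dev,
                          struct device_attribute *attr, char *buf)
{
        return sysfs_show_reg(dev, attr, buf, DPOT_ADDR_RDAC | DPOT_RDAC0);
}

static ssize_t set_rdac0(struct device *dev,
                         struct device_attribute *attr,
                         const char *buf, size_t count)
{
        return sysfs_set_reg(dev, attr, buf, count,
                             DPOT_ADDR_RDAC | DPOT_RDAC0);
}

static DEVICE_ATTR(rdac0, S_IWUSR | S_IRUGO, show_rdac0, set_rdac0);
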
@@ -409,74 +556,56 @@ static const struct attribute_group ad525x_group_commands = {
409 .attrs = ad525x_attributes_commands, 556 .attrs = ad525x_attributes_commands,
410}; 557};
411 558
412/* ------------------------------------------------------------------------- */ 559__devinit int ad_dpot_add_files(struct device *dev,
413 560 unsigned features, unsigned rdac)
414/* i2c device functions */ 561{
562 int err = sysfs_create_file(&dev->kobj,
563 dpot_attrib_wipers[rdac]);
564 if (features & F_CMD_EEP)
565 err |= sysfs_create_file(&dev->kobj,
566 dpot_attrib_eeprom[rdac]);
567 if (features & F_CMD_TOL)
568 err |= sysfs_create_file(&dev->kobj,
569 dpot_attrib_tolerance[rdac]);
570 if (features & F_CMD_OTP) {
571 err |= sysfs_create_file(&dev->kobj,
572 dpot_attrib_otp_en[rdac]);
573 err |= sysfs_create_file(&dev->kobj,
574 dpot_attrib_otp[rdac]);
575 }
415 576
416/** 577 if (err)
417 * ad525x_read - return the value contained in the specified register 578 dev_err(dev, "failed to register sysfs hooks for RDAC%d\n",
418 * on the AD5258 device. 579 rdac);
419 * @client: value returned from i2c_new_device()
420 * @reg: the register to read
421 *
422 * If the tolerance register is specified, 2 bytes are returned.
423 * Otherwise, 1 byte is returned. A negative value indicates an error
424 * occurred while reading the register.
425 */
426static s32 ad525x_read(struct i2c_client *client, u8 reg)
427{
428 struct dpot_data *data = i2c_get_clientdata(client);
429 580
430 if ((reg & AD525X_REG_TOL) || (data->max_pos > 256)) 581 return err;
431 return i2c_smbus_read_word_data(client, (reg & 0xF8) |
432 ((reg & 0x7) << 1));
433 else
434 return i2c_smbus_read_byte_data(client, reg);
435} 582}
436 583
437/** 584inline void ad_dpot_remove_files(struct device *dev,
438 * ad525x_write - store the given value in the specified register on 585 unsigned features, unsigned rdac)
439 * the AD5258 device. 586{
440 * @client: value returned from i2c_new_device() 587 sysfs_remove_file(&dev->kobj,
441 * @reg: the register to write 588 dpot_attrib_wipers[rdac]);
442 * @value: the byte to store in the register 589 if (features & F_CMD_EEP)
443 * 590 sysfs_remove_file(&dev->kobj,
444 * For certain instructions that do not require a data byte, "NULL" 591 dpot_attrib_eeprom[rdac]);
445 * should be specified for the "value" parameter. These instructions 592 if (features & F_CMD_TOL)
446 * include NOP, RESTORE_FROM_EEPROM, and STORE_TO_EEPROM. 593 sysfs_remove_file(&dev->kobj,
447 * 594 dpot_attrib_tolerance[rdac]);
448 * A negative return value indicates an error occurred while reading 595 if (features & F_CMD_OTP) {
449 * the register. 596 sysfs_remove_file(&dev->kobj,
450 */ 597 dpot_attrib_otp_en[rdac]);
451static s32 ad525x_write(struct i2c_client *client, u8 reg, u8 value) 598 sysfs_remove_file(&dev->kobj,
452{ 599 dpot_attrib_otp[rdac]);
453 struct dpot_data *data = i2c_get_clientdata(client); 600 }
454
455 /* Only write the instruction byte for certain commands */
456 if (reg & AD525X_I2C_CMD)
457 return i2c_smbus_write_byte(client, reg);
458
459 if (data->max_pos > 256)
460 return i2c_smbus_write_word_data(client, (reg & 0xF8) |
461 ((reg & 0x7) << 1), value);
462 else
463 /* All other registers require instruction + data bytes */
464 return i2c_smbus_write_byte_data(client, reg, value);
465} 601}
466 602
467static int ad525x_probe(struct i2c_client *client, 603__devinit int ad_dpot_probe(struct device *dev,
468 const struct i2c_device_id *id) 604 struct ad_dpot_bus_data *bdata, const struct ad_dpot_id *id)
469{ 605{
470 struct device *dev = &client->dev;
471 struct dpot_data *data;
472 int err = 0;
473 606
474 dev_dbg(dev, "%s\n", __func__); 607 struct dpot_data *data;
475 608 int i, err = 0;
476 if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE)) {
477 dev_err(dev, "missing I2C functionality for this driver\n");
478 goto exit;
479 }
480 609
481 data = kzalloc(sizeof(struct dpot_data), GFP_KERNEL); 610 data = kzalloc(sizeof(struct dpot_data), GFP_KERNEL);
482 if (!data) { 611 if (!data) {
@@ -484,183 +613,74 @@ static int ad525x_probe(struct i2c_client *client,
484 goto exit; 613 goto exit;
485 } 614 }
486 615
487 i2c_set_clientdata(client, data); 616 dev_set_drvdata(dev, data);
488 mutex_init(&data->update_lock); 617 mutex_init(&data->update_lock);
489 618
490 switch (id->driver_data) { 619 data->bdata = *bdata;
491 case AD5258_ID: 620 data->devid = id->devid;
492 data->max_pos = AD5258_MAX_POSITION; 621
493 err = sysfs_create_group(&dev->kobj, 622 data->max_pos = 1 << DPOT_MAX_POS(data->devid);
494 &ad525x_group_wipers[AD525X_RDAC0]); 623 data->rdac_mask = data->max_pos - 1;
495 break; 624 data->feat = DPOT_FEAT(data->devid);
496 case AD5259_ID: 625 data->uid = DPOT_UID(data->devid);
497 data->max_pos = AD5259_MAX_POSITION; 626 data->wipers = DPOT_WIPERS(data->devid);
498 err = sysfs_create_group(&dev->kobj, 627
499 &ad525x_group_wipers[AD525X_RDAC0]); 628 for (i = DPOT_RDAC0; i < MAX_RDACS; i++)
500 break; 629 if (data->wipers & (1 << i)) {
501 case AD5251_ID: 630 err = ad_dpot_add_files(dev, data->feat, i);
502 data->max_pos = AD5251_MAX_POSITION; 631 if (err)
503 err = sysfs_create_group(&dev->kobj, 632 goto exit_remove_files;
504 &ad525x_group_wipers[AD525X_RDAC1]); 633 /* power-up midscale */
505 err |= sysfs_create_group(&dev->kobj, 634 if (data->feat & F_RDACS_WONLY)
506 &ad525x_group_wipers[AD525X_RDAC3]); 635 data->rdac_cache[i] = data->max_pos / 2;
507 err |= sysfs_create_group(&dev->kobj, &ad525x_group_commands); 636 }
508 break; 637
509 case AD5252_ID: 638 if (data->feat & F_CMD_INC)
510 data->max_pos = AD5252_MAX_POSITION; 639 err = sysfs_create_group(&dev->kobj, &ad525x_group_commands);
511 err = sysfs_create_group(&dev->kobj,
512 &ad525x_group_wipers[AD525X_RDAC1]);
513 err |= sysfs_create_group(&dev->kobj,
514 &ad525x_group_wipers[AD525X_RDAC3]);
515 err |= sysfs_create_group(&dev->kobj, &ad525x_group_commands);
516 break;
517 case AD5253_ID:
518 data->max_pos = AD5253_MAX_POSITION;
519 err = sysfs_create_group(&dev->kobj,
520 &ad525x_group_wipers[AD525X_RDAC0]);
521 err |= sysfs_create_group(&dev->kobj,
522 &ad525x_group_wipers[AD525X_RDAC1]);
523 err |= sysfs_create_group(&dev->kobj,
524 &ad525x_group_wipers[AD525X_RDAC2]);
525 err |= sysfs_create_group(&dev->kobj,
526 &ad525x_group_wipers[AD525X_RDAC3]);
527 err |= sysfs_create_group(&dev->kobj, &ad525x_group_commands);
528 break;
529 case AD5254_ID:
530 data->max_pos = AD5254_MAX_POSITION;
531 err = sysfs_create_group(&dev->kobj,
532 &ad525x_group_wipers[AD525X_RDAC0]);
533 err |= sysfs_create_group(&dev->kobj,
534 &ad525x_group_wipers[AD525X_RDAC1]);
535 err |= sysfs_create_group(&dev->kobj,
536 &ad525x_group_wipers[AD525X_RDAC2]);
537 err |= sysfs_create_group(&dev->kobj,
538 &ad525x_group_wipers[AD525X_RDAC3]);
539 err |= sysfs_create_group(&dev->kobj, &ad525x_group_commands);
540 break;
541 case AD5255_ID:
542 data->max_pos = AD5255_MAX_POSITION;
543 err = sysfs_create_group(&dev->kobj,
544 &ad525x_group_wipers[AD525X_RDAC0]);
545 err |= sysfs_create_group(&dev->kobj,
546 &ad525x_group_wipers[AD525X_RDAC1]);
547 err |= sysfs_create_group(&dev->kobj,
548 &ad525x_group_wipers[AD525X_RDAC2]);
549 err |= sysfs_create_group(&dev->kobj, &ad525x_group_commands);
550 break;
551 default:
552 err = -ENODEV;
553 goto exit_free;
554 }
555 640
556 if (err) { 641 if (err) {
557 dev_err(dev, "failed to register sysfs hooks\n"); 642 dev_err(dev, "failed to register sysfs hooks\n");
558 goto exit_free; 643 goto exit_free;
559 } 644 }
560 645
561 data->devid = id->driver_data;
562 data->rdac_mask = data->max_pos - 1;
563
564 dev_info(dev, "%s %d-Position Digital Potentiometer registered\n", 646 dev_info(dev, "%s %d-Position Digital Potentiometer registered\n",
565 id->name, data->max_pos); 647 id->name, data->max_pos);
566 648
567 return 0; 649 return 0;
568 650
651exit_remove_files:
652 for (i = DPOT_RDAC0; i < MAX_RDACS; i++)
653 if (data->wipers & (1 << i))
654 ad_dpot_remove_files(dev, data->feat, i);
655
569exit_free: 656exit_free:
570 kfree(data); 657 kfree(data);
571 i2c_set_clientdata(client, NULL); 658 dev_set_drvdata(dev, NULL);
572exit: 659exit:
573 dev_err(dev, "failed to create client\n"); 660 dev_err(dev, "failed to create client for %s ID 0x%lX\n",
661 id->name, id->devid);
574 return err; 662 return err;
575} 663}
664EXPORT_SYMBOL(ad_dpot_probe);
576 665
577static int __devexit ad525x_remove(struct i2c_client *client) 666__devexit int ad_dpot_remove(struct device *dev)
578{ 667{
579 struct dpot_data *data = i2c_get_clientdata(client); 668 struct dpot_data *data = dev_get_drvdata(dev);
580 struct device *dev = &client->dev; 669 int i;
581 670
582 switch (data->devid) { 671 for (i = DPOT_RDAC0; i < MAX_RDACS; i++)
583 case AD5258_ID: 672 if (data->wipers & (1 << i))
584 case AD5259_ID: 673 ad_dpot_remove_files(dev, data->feat, i);
585 sysfs_remove_group(&dev->kobj,
586 &ad525x_group_wipers[AD525X_RDAC0]);
587 break;
588 case AD5251_ID:
589 case AD5252_ID:
590 sysfs_remove_group(&dev->kobj,
591 &ad525x_group_wipers[AD525X_RDAC1]);
592 sysfs_remove_group(&dev->kobj,
593 &ad525x_group_wipers[AD525X_RDAC3]);
594 sysfs_remove_group(&dev->kobj, &ad525x_group_commands);
595 break;
596 case AD5253_ID:
597 case AD5254_ID:
598 sysfs_remove_group(&dev->kobj,
599 &ad525x_group_wipers[AD525X_RDAC0]);
600 sysfs_remove_group(&dev->kobj,
601 &ad525x_group_wipers[AD525X_RDAC1]);
602 sysfs_remove_group(&dev->kobj,
603 &ad525x_group_wipers[AD525X_RDAC2]);
604 sysfs_remove_group(&dev->kobj,
605 &ad525x_group_wipers[AD525X_RDAC3]);
606 sysfs_remove_group(&dev->kobj, &ad525x_group_commands);
607 break;
608 case AD5255_ID:
609 sysfs_remove_group(&dev->kobj,
610 &ad525x_group_wipers[AD525X_RDAC0]);
611 sysfs_remove_group(&dev->kobj,
612 &ad525x_group_wipers[AD525X_RDAC1]);
613 sysfs_remove_group(&dev->kobj,
614 &ad525x_group_wipers[AD525X_RDAC2]);
615 sysfs_remove_group(&dev->kobj, &ad525x_group_commands);
616 break;
617 }
618 674
619 i2c_set_clientdata(client, NULL);
620 kfree(data); 675 kfree(data);
621 676
622 return 0; 677 return 0;
623} 678}
679EXPORT_SYMBOL(ad_dpot_remove);
624 680
625static const struct i2c_device_id ad525x_idtable[] = {
626 {"ad5258", AD5258_ID},
627 {"ad5259", AD5259_ID},
628 {"ad5251", AD5251_ID},
629 {"ad5252", AD5252_ID},
630 {"ad5253", AD5253_ID},
631 {"ad5254", AD5254_ID},
632 {"ad5255", AD5255_ID},
633 {}
634};
635
636MODULE_DEVICE_TABLE(i2c, ad525x_idtable);
637
638static struct i2c_driver ad525x_driver = {
639 .driver = {
640 .owner = THIS_MODULE,
641 .name = DRIVER_NAME,
642 },
643 .id_table = ad525x_idtable,
644 .probe = ad525x_probe,
645 .remove = __devexit_p(ad525x_remove),
646};
647
648static int __init ad525x_init(void)
649{
650 return i2c_add_driver(&ad525x_driver);
651}
652
653module_init(ad525x_init);
654
655static void __exit ad525x_exit(void)
656{
657 i2c_del_driver(&ad525x_driver);
658}
659
660module_exit(ad525x_exit);
661 681
662MODULE_AUTHOR("Chris Verges <chrisv@cyberswitching.com>, " 682MODULE_AUTHOR("Chris Verges <chrisv@cyberswitching.com>, "
663 "Michael Hennerich <hennerich@blackfin.uclinux.org>, "); 683 "Michael Hennerich <hennerich@blackfin.uclinux.org>");
664MODULE_DESCRIPTION("AD5258/9 digital potentiometer driver"); 684MODULE_DESCRIPTION("Digital potentiometer driver");
665MODULE_LICENSE("GPL"); 685MODULE_LICENSE("GPL");
666MODULE_VERSION(DRIVER_VERSION); 686MODULE_VERSION(DRIVER_VERSION);
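
The core above is now bus-agnostic: all device access funnels through the ad_dpot_bus_ops table declared in the new header that follows. A sketch of an I2C shim layered on top; the function names and the partial ops table are illustrative assumptions, not the literal ad525x_dpot-i2c.c:

#include <linux/i2c.h>
#include "ad525x_dpot.h"

static int read_r8d8(void *client, u8 reg)
{
        return i2c_smbus_read_byte_data(client, reg);
}

static int write_r8d8(void *client, u8 reg, u8 val)
{
        return i2c_smbus_write_byte_data(client, reg, val);
}

static const struct ad_dpot_bus_ops bops = {
        .read_r8d8  = read_r8d8,
        .write_r8d8 = write_r8d8,
        /* remaining ops wired to the matching SMBus helpers */
};

static int ad_dpot_i2c_probe(struct i2c_client *client,
                             const struct i2c_device_id *id)
{
        struct ad_dpot_bus_data bdata = {
                .client = client,
                .bops   = &bops,
        };
        struct ad_dpot_id dpot_id = {
                .name  = (char *)id->name,
                .devid = id->driver_data,
        };

        return ad_dpot_probe(&client->dev, &bdata, &dpot_id);
}
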
diff --git a/drivers/misc/ad525x_dpot.h b/drivers/misc/ad525x_dpot.h
new file mode 100644
index 000000000000..78b89fd2e2fd
--- /dev/null
+++ b/drivers/misc/ad525x_dpot.h
@@ -0,0 +1,202 @@
1/*
2 * Driver for the Analog Devices digital potentiometers
3 *
4 * Copyright (C) 2010 Michael Hennerich, Analog Devices Inc.
5 *
6 * Licensed under the GPL-2 or later.
7 */
8
9#ifndef _AD_DPOT_H_
10#define _AD_DPOT_H_
11
12#include <linux/types.h>
13
14#define DPOT_CONF(features, wipers, max_pos, uid) \
15 (((features) << 18) | (((wipers) & 0xFF) << 10) | \
16 ((max_pos & 0xF) << 6) | (uid & 0x3F))
17
18#define DPOT_UID(conf) (conf & 0x3F)
19#define DPOT_MAX_POS(conf) ((conf >> 6) & 0xF)
20#define DPOT_WIPERS(conf) ((conf >> 10) & 0xFF)
21#define DPOT_FEAT(conf) (conf >> 18)
22
23#define BRDAC0 (1 << 0)
24#define BRDAC1 (1 << 1)
25#define BRDAC2 (1 << 2)
26#define BRDAC3 (1 << 3)
27#define BRDAC4 (1 << 4)
28#define BRDAC5 (1 << 5)
29#define MAX_RDACS 6
30
31#define F_CMD_INC (1 << 0) /* Features INC/DEC ALL, 6dB */
32#define F_CMD_EEP (1 << 1) /* Features EEPROM */
33#define F_CMD_OTP (1 << 2) /* Features OTP */
 34#define F_CMD_TOL (1 << 3) /* RDACs have a tolerance register */
35#define F_RDACS_RW (1 << 4) /* RDACS are Read/Write */
36#define F_RDACS_WONLY (1 << 5) /* RDACS are Write only */
 37#define F_AD_APPDATA (1 << 6) /* RDAC address is appended to the data */
38#define F_SPI_8BIT (1 << 7) /* All SPI XFERS are 8-bit */
39#define F_SPI_16BIT (1 << 8) /* All SPI XFERS are 16-bit */
40#define F_SPI_24BIT (1 << 9) /* All SPI XFERS are 24-bit */
41
42#define F_RDACS_RW_TOL (F_RDACS_RW | F_CMD_EEP | F_CMD_TOL)
43#define F_RDACS_RW_EEP (F_RDACS_RW | F_CMD_EEP)
44#define F_SPI (F_SPI_8BIT | F_SPI_16BIT | F_SPI_24BIT)
45
46enum dpot_devid {
47 AD5258_ID = DPOT_CONF(F_RDACS_RW_TOL, BRDAC0, 6, 0), /* I2C */
48 AD5259_ID = DPOT_CONF(F_RDACS_RW_TOL, BRDAC0, 8, 1),
49 AD5251_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC,
50 BRDAC0 | BRDAC3, 6, 2),
51 AD5252_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC,
52 BRDAC0 | BRDAC3, 8, 3),
53 AD5253_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC,
54 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 6, 4),
55 AD5254_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC,
56 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 8, 5),
57 AD5255_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC,
58 BRDAC0 | BRDAC1 | BRDAC2, 9, 6),
59 AD5160_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
60 BRDAC0, 8, 7), /* SPI */
61 AD5161_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
62 BRDAC0, 8, 8),
63 AD5162_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
64 BRDAC0 | BRDAC1, 8, 9),
65 AD5165_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
66 BRDAC0, 8, 10),
67 AD5200_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
68 BRDAC0, 8, 11),
69 AD5201_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
70 BRDAC0, 5, 12),
71 AD5203_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
72 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 6, 13),
73 AD5204_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
74 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 8, 14),
75 AD5206_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
76 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3 | BRDAC4 | BRDAC5,
77 8, 15),
78 AD5207_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
79 BRDAC0 | BRDAC1, 8, 16),
80 AD5231_ID = DPOT_CONF(F_RDACS_RW_EEP | F_CMD_INC | F_SPI_24BIT,
81 BRDAC0, 10, 17),
82 AD5232_ID = DPOT_CONF(F_RDACS_RW_EEP | F_CMD_INC | F_SPI_16BIT,
83 BRDAC0 | BRDAC1, 8, 18),
84 AD5233_ID = DPOT_CONF(F_RDACS_RW_EEP | F_CMD_INC | F_SPI_16BIT,
85 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 6, 19),
86 AD5235_ID = DPOT_CONF(F_RDACS_RW_EEP | F_CMD_INC | F_SPI_24BIT,
87 BRDAC0 | BRDAC1, 10, 20),
88 AD5260_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
89 BRDAC0, 8, 21),
90 AD5262_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
91 BRDAC0 | BRDAC1, 8, 22),
92 AD5263_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
93 BRDAC0 | BRDAC1 | BRDAC2 | BRDAC3, 8, 23),
94 AD5290_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
95 BRDAC0, 8, 24),
96 AD5291_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 8, 25),
97 AD5292_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 10, 26),
98 AD5293_ID = DPOT_CONF(F_RDACS_RW | F_SPI_16BIT, BRDAC0, 10, 27),
99 AD7376_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
100 BRDAC0, 7, 28),
101 AD8400_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_8BIT,
102 BRDAC0, 8, 29),
103 AD8402_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
104 BRDAC0 | BRDAC1, 8, 30),
105 AD8403_ID = DPOT_CONF(F_RDACS_WONLY | F_AD_APPDATA | F_SPI_16BIT,
106 BRDAC0 | BRDAC1 | BRDAC2, 8, 31),
107 ADN2850_ID = DPOT_CONF(F_RDACS_RW_EEP | F_CMD_INC | F_SPI_24BIT,
108 BRDAC0 | BRDAC1, 10, 32),
109 AD5241_ID = DPOT_CONF(F_RDACS_RW, BRDAC0, 8, 33),
110 AD5242_ID = DPOT_CONF(F_RDACS_RW, BRDAC0 | BRDAC1, 8, 34),
111 AD5243_ID = DPOT_CONF(F_RDACS_RW, BRDAC0 | BRDAC1, 8, 35),
112 AD5245_ID = DPOT_CONF(F_RDACS_RW, BRDAC0, 8, 36),
113 AD5246_ID = DPOT_CONF(F_RDACS_RW, BRDAC0, 7, 37),
114 AD5247_ID = DPOT_CONF(F_RDACS_RW, BRDAC0, 7, 38),
115 AD5248_ID = DPOT_CONF(F_RDACS_RW, BRDAC0 | BRDAC1, 8, 39),
116 AD5280_ID = DPOT_CONF(F_RDACS_RW, BRDAC0, 8, 40),
117 AD5282_ID = DPOT_CONF(F_RDACS_RW, BRDAC0 | BRDAC1, 8, 41),
118 ADN2860_ID = DPOT_CONF(F_RDACS_RW_TOL | F_CMD_INC,
119 BRDAC0 | BRDAC1 | BRDAC2, 9, 42),
120 AD5273_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 6, 43),
121 AD5171_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 6, 44),
122 AD5170_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0, 8, 45),
123 AD5172_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0 | BRDAC1, 8, 46),
124 AD5173_ID = DPOT_CONF(F_RDACS_RW | F_CMD_OTP, BRDAC0 | BRDAC1, 8, 47),
125};
126
127#define DPOT_RDAC0 0
128#define DPOT_RDAC1 1
129#define DPOT_RDAC2 2
130#define DPOT_RDAC3 3
131#define DPOT_RDAC4 4
132#define DPOT_RDAC5 5
133
134#define DPOT_RDAC_MASK 0x1F
135
136#define DPOT_REG_TOL 0x18
137#define DPOT_TOL_RDAC0 (DPOT_REG_TOL | DPOT_RDAC0)
138#define DPOT_TOL_RDAC1 (DPOT_REG_TOL | DPOT_RDAC1)
139#define DPOT_TOL_RDAC2 (DPOT_REG_TOL | DPOT_RDAC2)
140#define DPOT_TOL_RDAC3 (DPOT_REG_TOL | DPOT_RDAC3)
141#define DPOT_TOL_RDAC4 (DPOT_REG_TOL | DPOT_RDAC4)
142#define DPOT_TOL_RDAC5 (DPOT_REG_TOL | DPOT_RDAC5)
143
144/* RDAC-to-EEPROM Interface Commands */
145#define DPOT_ADDR_RDAC (0x0 << 5)
146#define DPOT_ADDR_EEPROM (0x1 << 5)
147#define DPOT_ADDR_OTP (0x1 << 6)
148#define DPOT_ADDR_CMD (0x1 << 7)
149#define DPOT_ADDR_OTP_EN (0x1 << 9)
150
151#define DPOT_DEC_ALL_6DB (DPOT_ADDR_CMD | (0x4 << 3))
152#define DPOT_INC_ALL_6DB (DPOT_ADDR_CMD | (0x9 << 3))
153#define DPOT_DEC_ALL (DPOT_ADDR_CMD | (0x6 << 3))
154#define DPOT_INC_ALL (DPOT_ADDR_CMD | (0xB << 3))
155
156#define DPOT_SPI_RDAC 0xB0
157#define DPOT_SPI_EEPROM 0x30
158#define DPOT_SPI_READ_RDAC 0xA0
159#define DPOT_SPI_READ_EEPROM 0x90
160#define DPOT_SPI_DEC_ALL_6DB 0x50
161#define DPOT_SPI_INC_ALL_6DB 0xD0
162#define DPOT_SPI_DEC_ALL 0x70
163#define DPOT_SPI_INC_ALL 0xF0
164
165/* AD5291/2/3 use special commands */
166#define DPOT_AD5291_RDAC 0x01
167#define DPOT_AD5291_READ_RDAC 0x02
168
169/* AD524x use special commands */
170#define DPOT_AD5291_RDAC_AB 0x80
171
172#define DPOT_AD5273_FUSE 0x80
173#define DPOT_AD5270_2_3_FUSE 0x20
174#define DPOT_AD5270_2_3_OW 0x08
175#define DPOT_AD5272_3_A0 0x08
176#define DPOT_AD5270_2FUSE 0x80
177
178struct dpot_data;
179
180struct ad_dpot_bus_ops {
181 int (*read_d8) (void *client);
182 int (*read_r8d8) (void *client, u8 reg);
183 int (*read_r8d16) (void *client, u8 reg);
184 int (*write_d8) (void *client, u8 val);
185 int (*write_r8d8) (void *client, u8 reg, u8 val);
186 int (*write_r8d16) (void *client, u8 reg, u16 val);
187};
188
189struct ad_dpot_bus_data {
190 void *client;
191 const struct ad_dpot_bus_ops *bops;
192};
193
194struct ad_dpot_id {
195 char *name;
196 unsigned long devid;
197};
198
199int ad_dpot_probe(struct device *dev, struct ad_dpot_bus_data *bdata, const struct ad_dpot_id *id);
200int ad_dpot_remove(struct device *dev);
201
202#endif
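
DPOT_CONF packs four fields (feature bits, wiper bitmap, log2 of the position count, and a unique id) into one enum value that doubles as the device id. A standalone demonstration of the packing, using local copies of the macros so it compiles in userspace:

#include <stdio.h>

#define DPOT_CONF(features, wipers, max_pos, uid) \
        (((features) << 18) | (((wipers) & 0xFF) << 10) | \
         ((max_pos & 0xF) << 6) | (uid & 0x3F))
#define DPOT_UID(conf)          (conf & 0x3F)
#define DPOT_MAX_POS(conf)      ((conf >> 6) & 0xF)
#define DPOT_WIPERS(conf)       ((conf >> 10) & 0xFF)
#define DPOT_FEAT(conf)         (conf >> 18)

int main(void)
{
        /* AD5258: F_RDACS_RW_TOL == 0x1A, wiper RDAC0, 2^6 steps, uid 0 */
        unsigned long id = DPOT_CONF(0x1AUL, 0x01, 6, 0);

        /* prints: max_pos=64 wipers=0x1 feat=0x1a uid=0 */
        printf("max_pos=%d wipers=%#lx feat=%#lx uid=%lu\n",
               1 << DPOT_MAX_POS(id), DPOT_WIPERS(id),
               DPOT_FEAT(id), DPOT_UID(id));
        return 0;
}
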
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index a441aad922c2..3b7ab20a5c54 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -5162,13 +5162,6 @@ static void proc_SSID_on_close(struct inode *inode, struct file *file)
5162 enable_MAC(ai, 1); 5162 enable_MAC(ai, 1);
5163} 5163}
5164 5164
5165static inline u8 hexVal(char c) {
5166 if (c>='0' && c<='9') return c -= '0';
5167 if (c>='a' && c<='f') return c -= 'a'-10;
5168 if (c>='A' && c<='F') return c -= 'A'-10;
5169 return 0;
5170}
5171
5172static void proc_APList_on_close( struct inode *inode, struct file *file ) { 5165static void proc_APList_on_close( struct inode *inode, struct file *file ) {
5173 struct proc_data *data = (struct proc_data *)file->private_data; 5166 struct proc_data *data = (struct proc_data *)file->private_data;
5174 struct proc_dir_entry *dp = PDE(inode); 5167 struct proc_dir_entry *dp = PDE(inode);
@@ -5188,11 +5181,11 @@ static void proc_APList_on_close( struct inode *inode, struct file *file ) {
5188 switch(j%3) { 5181 switch(j%3) {
5189 case 0: 5182 case 0:
5190 APList_rid.ap[i][j/3]= 5183 APList_rid.ap[i][j/3]=
5191 hexVal(data->wbuffer[j+i*6*3])<<4; 5184 hex_to_bin(data->wbuffer[j+i*6*3])<<4;
5192 break; 5185 break;
5193 case 1: 5186 case 1:
5194 APList_rid.ap[i][j/3]|= 5187 APList_rid.ap[i][j/3]|=
5195 hexVal(data->wbuffer[j+i*6*3]); 5188 hex_to_bin(data->wbuffer[j+i*6*3]);
5196 break; 5189 break;
5197 } 5190 }
5198 } 5191 }
@@ -5340,10 +5333,10 @@ static void proc_wepkey_on_close( struct inode *inode, struct file *file ) {
5340 for( i = 0; i < 16*3 && data->wbuffer[i+j]; i++ ) { 5333 for( i = 0; i < 16*3 && data->wbuffer[i+j]; i++ ) {
5341 switch(i%3) { 5334 switch(i%3) {
5342 case 0: 5335 case 0:
5343 key[i/3] = hexVal(data->wbuffer[i+j])<<4; 5336 key[i/3] = hex_to_bin(data->wbuffer[i+j])<<4;
5344 break; 5337 break;
5345 case 1: 5338 case 1:
5346 key[i/3] |= hexVal(data->wbuffer[i+j]); 5339 key[i/3] |= hex_to_bin(data->wbuffer[i+j]);
5347 break; 5340 break;
5348 } 5341 }
5349 } 5342 }
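
This hunk drops the driver-private hexVal() in favour of the kernel-wide hex_to_bin() from <linux/kernel.h>. One behavioural difference worth knowing: hex_to_bin() returns -1 for a non-hex character where hexVal() silently returned 0, so callers can now reject malformed input. A sketch of the nibble-pair idiom with that check added:

#include <linux/errno.h>
#include <linux/kernel.h>       /* hex_to_bin() */

static int decode_hex_byte(char hi, char lo, u8 *out)
{
        int h = hex_to_bin(hi);
        int l = hex_to_bin(lo);

        if (h < 0 || l < 0)     /* not a hex digit */
                return -EINVAL;

        *out = (h << 4) | l;    /* first char is the upper nibble */
        return 0;
}
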
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index 6a86cdfd79fa..9d30eeb8c810 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -179,14 +179,16 @@ static mode_t power_supply_attr_is_visible(struct kobject *kobj,
179{ 179{
180 struct device *dev = container_of(kobj, struct device, kobj); 180 struct device *dev = container_of(kobj, struct device, kobj);
181 struct power_supply *psy = dev_get_drvdata(dev); 181 struct power_supply *psy = dev_get_drvdata(dev);
182 mode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
182 int i; 183 int i;
183 184
185 if (attrno == POWER_SUPPLY_PROP_TYPE)
186 return mode;
187
184 for (i = 0; i < psy->num_properties; i++) { 188 for (i = 0; i < psy->num_properties; i++) {
185 int property = psy->properties[i]; 189 int property = psy->properties[i];
186 190
187 if (property == attrno) { 191 if (property == attrno) {
188 mode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
189
190 if (psy->property_is_writeable && 192 if (psy->property_is_writeable &&
191 psy->property_is_writeable(psy, property) > 0) 193 psy->property_is_writeable(psy, property) > 0)
192 mode |= S_IWUSR; 194 mode |= S_IWUSR;
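
For context, an is_visible callback returns the permission bits for each candidate attribute, or 0 to suppress the file; the hunk above hoists the read-only default and returns it unconditionally for POWER_SUPPLY_PROP_TYPE, so every supply now exposes its type. The general shape, schematically; device_supports() and device_can_write() are placeholders, not real API:

static mode_t my_attr_is_visible(struct kobject *kobj,
                                 struct attribute *attr, int attrno)
{
        mode_t mode = S_IRUSR | S_IRGRP | S_IROTH;      /* 0444 */

        if (!device_supports(attrno))   /* placeholder predicate */
                return 0;               /* 0 hides the file entirely */

        if (device_can_write(attrno))   /* placeholder predicate */
                mode |= S_IWUSR;        /* 0644 */

        return mode;
}
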
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 50ac047cd136..f1598324344c 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -640,7 +640,7 @@ config RTC_DRV_OMAP
640 640
641config RTC_DRV_S3C 641config RTC_DRV_S3C
642 tristate "Samsung S3C series SoC RTC" 642 tristate "Samsung S3C series SoC RTC"
643 depends on ARCH_S3C2410 643 depends on ARCH_S3C2410 || ARCH_S3C64XX
644 help 644 help
645 RTC (Realtime Clock) driver for the clock inbuilt into the 645 RTC (Realtime Clock) driver for the clock inbuilt into the
646 Samsung S3C24XX series of SoCs. This can provide periodic 646 Samsung S3C24XX series of SoCs. This can provide periodic
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 96e8e70fbf1e..11b8ea29d2b7 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -719,6 +719,9 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
719 } 719 }
720 } 720 }
721 721
722 cmos_rtc.dev = dev;
723 dev_set_drvdata(dev, &cmos_rtc);
724
722 cmos_rtc.rtc = rtc_device_register(driver_name, dev, 725 cmos_rtc.rtc = rtc_device_register(driver_name, dev,
723 &cmos_rtc_ops, THIS_MODULE); 726 &cmos_rtc_ops, THIS_MODULE);
724 if (IS_ERR(cmos_rtc.rtc)) { 727 if (IS_ERR(cmos_rtc.rtc)) {
@@ -726,8 +729,6 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
726 goto cleanup0; 729 goto cleanup0;
727 } 730 }
728 731
729 cmos_rtc.dev = dev;
730 dev_set_drvdata(dev, &cmos_rtc);
731 rename_region(ports, dev_name(&cmos_rtc.rtc->dev)); 732 rename_region(ports, dev_name(&cmos_rtc.rtc->dev));
732 733
733 spin_lock_irq(&rtc_lock); 734 spin_lock_irq(&rtc_lock);
diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c
index 532acf9b05d8..359d1e04626c 100644
--- a/drivers/rtc/rtc-ds1302.c
+++ b/drivers/rtc/rtc-ds1302.c
@@ -16,7 +16,6 @@
16#include <linux/rtc.h> 16#include <linux/rtc.h>
17#include <linux/io.h> 17#include <linux/io.h>
18#include <linux/bcd.h> 18#include <linux/bcd.h>
19#include <asm/rtc.h>
20 19
21#define DRV_NAME "rtc-ds1302" 20#define DRV_NAME "rtc-ds1302"
22#define DRV_VERSION "0.1.1" 21#define DRV_VERSION "0.1.1"
@@ -34,14 +33,55 @@
34#define RTC_ADDR_MIN 0x01 /* Address of minute register */ 33#define RTC_ADDR_MIN 0x01 /* Address of minute register */
35#define RTC_ADDR_SEC 0x00 /* Address of second register */ 34#define RTC_ADDR_SEC 0x00 /* Address of second register */
36 35
36#ifdef CONFIG_SH_SECUREEDGE5410
37#include <asm/rtc.h>
38#include <mach/snapgear.h>
39
37#define RTC_RESET 0x1000 40#define RTC_RESET 0x1000
38#define RTC_IODATA 0x0800 41#define RTC_IODATA 0x0800
39#define RTC_SCLK 0x0400 42#define RTC_SCLK 0x0400
40 43
41#ifdef CONFIG_SH_SECUREEDGE5410
42#include <mach/snapgear.h>
43#define set_dp(x) SECUREEDGE_WRITE_IOPORT(x, 0x1c00) 44#define set_dp(x) SECUREEDGE_WRITE_IOPORT(x, 0x1c00)
44#define get_dp() SECUREEDGE_READ_IOPORT() 45#define get_dp() SECUREEDGE_READ_IOPORT()
46#define ds1302_set_tx()
47#define ds1302_set_rx()
48
49static inline int ds1302_hw_init(void)
50{
51 return 0;
52}
53
54static inline void ds1302_reset(void)
55{
56 set_dp(get_dp() & ~(RTC_RESET | RTC_IODATA | RTC_SCLK));
57}
58
59static inline void ds1302_clock(void)
60{
61 set_dp(get_dp() | RTC_SCLK); /* clock high */
62 set_dp(get_dp() & ~RTC_SCLK); /* clock low */
63}
64
65static inline void ds1302_start(void)
66{
67 set_dp(get_dp() | RTC_RESET);
68}
69
70static inline void ds1302_stop(void)
71{
72 set_dp(get_dp() & ~RTC_RESET);
73}
74
75static inline void ds1302_txbit(int bit)
76{
77 set_dp((get_dp() & ~RTC_IODATA) | (bit ? RTC_IODATA : 0));
78}
79
80static inline int ds1302_rxbit(void)
81{
82 return !!(get_dp() & RTC_IODATA);
83}
84
45#else 85#else
46#error "Add support for your platform" 86#error "Add support for your platform"
47#endif 87#endif
@@ -50,11 +90,11 @@ static void ds1302_sendbits(unsigned int val)
50{ 90{
51 int i; 91 int i;
52 92
93 ds1302_set_tx();
94
53 for (i = 8; (i); i--, val >>= 1) { 95 for (i = 8; (i); i--, val >>= 1) {
54 set_dp((get_dp() & ~RTC_IODATA) | ((val & 0x1) ? 96 ds1302_txbit(val & 0x1);
55 RTC_IODATA : 0)); 97 ds1302_clock();
56 set_dp(get_dp() | RTC_SCLK); /* clock high */
57 set_dp(get_dp() & ~RTC_SCLK); /* clock low */
58 } 98 }
59} 99}
60 100
@@ -63,10 +103,11 @@ static unsigned int ds1302_recvbits(void)
63 unsigned int val; 103 unsigned int val;
64 int i; 104 int i;
65 105
106 ds1302_set_rx();
107
66 for (i = 0, val = 0; (i < 8); i++) { 108 for (i = 0, val = 0; (i < 8); i++) {
67 val |= (((get_dp() & RTC_IODATA) ? 1 : 0) << i); 109 val |= (ds1302_rxbit() << i);
68 set_dp(get_dp() | RTC_SCLK); /* clock high */ 110 ds1302_clock();
69 set_dp(get_dp() & ~RTC_SCLK); /* clock low */
70 } 111 }
71 112
72 return val; 113 return val;
@@ -76,23 +117,24 @@ static unsigned int ds1302_readbyte(unsigned int addr)
76{ 117{
77 unsigned int val; 118 unsigned int val;
78 119
79 set_dp(get_dp() & ~(RTC_RESET | RTC_IODATA | RTC_SCLK)); 120 ds1302_reset();
80 121
81 set_dp(get_dp() | RTC_RESET); 122 ds1302_start();
82 ds1302_sendbits(((addr & 0x3f) << 1) | RTC_CMD_READ); 123 ds1302_sendbits(((addr & 0x3f) << 1) | RTC_CMD_READ);
83 val = ds1302_recvbits(); 124 val = ds1302_recvbits();
84 set_dp(get_dp() & ~RTC_RESET); 125 ds1302_stop();
85 126
86 return val; 127 return val;
87} 128}
88 129
89static void ds1302_writebyte(unsigned int addr, unsigned int val) 130static void ds1302_writebyte(unsigned int addr, unsigned int val)
90{ 131{
91 set_dp(get_dp() & ~(RTC_RESET | RTC_IODATA | RTC_SCLK)); 132 ds1302_reset();
92 set_dp(get_dp() | RTC_RESET); 133
134 ds1302_start();
93 ds1302_sendbits(((addr & 0x3f) << 1) | RTC_CMD_WRITE); 135 ds1302_sendbits(((addr & 0x3f) << 1) | RTC_CMD_WRITE);
94 ds1302_sendbits(val); 136 ds1302_sendbits(val);
95 set_dp(get_dp() & ~RTC_RESET); 137 ds1302_stop();
96} 138}
97 139
98static int ds1302_rtc_read_time(struct device *dev, struct rtc_time *tm) 140static int ds1302_rtc_read_time(struct device *dev, struct rtc_time *tm)
@@ -167,13 +209,20 @@ static int __init ds1302_rtc_probe(struct platform_device *pdev)
167{ 209{
168 struct rtc_device *rtc; 210 struct rtc_device *rtc;
169 211
212 if (ds1302_hw_init()) {
 213 dev_err(&pdev->dev, "Failed to init communication channel\n");
214 return -EINVAL;
215 }
216
170 /* Reset */ 217 /* Reset */
171 set_dp(get_dp() & ~(RTC_RESET | RTC_IODATA | RTC_SCLK)); 218 ds1302_reset();
172 219
173 /* Write a magic value to the DS1302 RAM, and see if it sticks. */ 220 /* Write a magic value to the DS1302 RAM, and see if it sticks. */
174 ds1302_writebyte(RTC_ADDR_RAM0, 0x42); 221 ds1302_writebyte(RTC_ADDR_RAM0, 0x42);
175 if (ds1302_readbyte(RTC_ADDR_RAM0) != 0x42) 222 if (ds1302_readbyte(RTC_ADDR_RAM0) != 0x42) {
 223 dev_err(&pdev->dev, "Failed to probe\n");
176 return -ENODEV; 224 return -ENODEV;
225 }
177 226
178 rtc = rtc_device_register("ds1302", &pdev->dev, 227 rtc = rtc_device_register("ds1302", &pdev->dev,
179 &ds1302_rtc_ops, THIS_MODULE); 228 &ds1302_rtc_ops, THIS_MODULE);
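
With the transaction code rewritten in terms of a small set of inline primitives (hw_init, reset, clock, start, stop, txbit, rxbit and the tx/rx direction hooks), porting the driver to a new board means supplying only those primitives under a new #elif branch. A sketch; CONFIG_MY_BOARD and the my_gpio_*() helpers are hypothetical stand-ins for a platform's real GPIO accessors:

#elif defined(CONFIG_MY_BOARD)          /* hypothetical platform */

static inline int ds1302_hw_init(void)
{
        return my_gpio_request_pins();  /* hypothetical helper */
}

static inline void ds1302_reset(void)
{
        my_gpio_clear(MY_CE | MY_IO | MY_SCLK); /* all lines low */
}

static inline void ds1302_clock(void)
{
        my_gpio_set(MY_SCLK);           /* clock high */
        my_gpio_clear(MY_SCLK);         /* clock low  */
}

static inline void ds1302_start(void)  { my_gpio_set(MY_CE); }
static inline void ds1302_stop(void)   { my_gpio_clear(MY_CE); }

static inline void ds1302_txbit(int bit)
{
        if (bit)
                my_gpio_set(MY_IO);
        else
                my_gpio_clear(MY_IO);
}

static inline int ds1302_rxbit(void)
{
        return my_gpio_read(MY_IO);
}

#define ds1302_set_tx() my_gpio_dir_out(MY_IO)
#define ds1302_set_rx() my_gpio_dir_in(MY_IO)
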
diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c
index 054e05294af8..468200c38ecb 100644
--- a/drivers/rtc/rtc-isl1208.c
+++ b/drivers/rtc/rtc-isl1208.c
@@ -462,39 +462,16 @@ isl1208_sysfs_store_usr(struct device *dev,
462static DEVICE_ATTR(usr, S_IRUGO | S_IWUSR, isl1208_sysfs_show_usr, 462static DEVICE_ATTR(usr, S_IRUGO | S_IWUSR, isl1208_sysfs_show_usr,
463 isl1208_sysfs_store_usr); 463 isl1208_sysfs_store_usr);
464 464
465static int 465static struct attribute *isl1208_rtc_attrs[] = {
466isl1208_sysfs_register(struct device *dev) 466 &dev_attr_atrim.attr,
467{ 467 &dev_attr_dtrim.attr,
468 int err; 468 &dev_attr_usr.attr,
469 469 NULL
470 err = device_create_file(dev, &dev_attr_atrim); 470};
471 if (err)
472 return err;
473
474 err = device_create_file(dev, &dev_attr_dtrim);
475 if (err) {
476 device_remove_file(dev, &dev_attr_atrim);
477 return err;
478 }
479
480 err = device_create_file(dev, &dev_attr_usr);
481 if (err) {
482 device_remove_file(dev, &dev_attr_atrim);
483 device_remove_file(dev, &dev_attr_dtrim);
484 }
485
486 return 0;
487}
488
489static int
490isl1208_sysfs_unregister(struct device *dev)
491{
492 device_remove_file(dev, &dev_attr_dtrim);
493 device_remove_file(dev, &dev_attr_atrim);
494 device_remove_file(dev, &dev_attr_usr);
495 471
496 return 0; 472static const struct attribute_group isl1208_rtc_sysfs_files = {
497} 473 .attrs = isl1208_rtc_attrs,
474};
498 475
499static int 476static int
500isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id) 477isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id)
@@ -529,7 +506,7 @@ isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id)
529 dev_warn(&client->dev, "rtc power failure detected, " 506 dev_warn(&client->dev, "rtc power failure detected, "
530 "please set clock.\n"); 507 "please set clock.\n");
531 508
532 rc = isl1208_sysfs_register(&client->dev); 509 rc = sysfs_create_group(&client->dev.kobj, &isl1208_rtc_sysfs_files);
533 if (rc) 510 if (rc)
534 goto exit_unregister; 511 goto exit_unregister;
535 512
@@ -546,7 +523,7 @@ isl1208_remove(struct i2c_client *client)
546{ 523{
547 struct rtc_device *rtc = i2c_get_clientdata(client); 524 struct rtc_device *rtc = i2c_get_clientdata(client);
548 525
549 isl1208_sysfs_unregister(&client->dev); 526 sysfs_remove_group(&client->dev.kobj, &isl1208_rtc_sysfs_files);
550 rtc_device_unregister(rtc); 527 rtc_device_unregister(rtc);
551 528
552 return 0; 529 return 0;
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index d71fe61db1d6..25ec921db07c 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -379,7 +379,6 @@ static struct rtc_class_ops mxc_rtc_ops = {
379 379
380static int __init mxc_rtc_probe(struct platform_device *pdev) 380static int __init mxc_rtc_probe(struct platform_device *pdev)
381{ 381{
382 struct clk *clk;
383 struct resource *res; 382 struct resource *res;
384 struct rtc_device *rtc; 383 struct rtc_device *rtc;
385 struct rtc_plat_data *pdata = NULL; 384 struct rtc_plat_data *pdata = NULL;
@@ -402,14 +401,15 @@ static int __init mxc_rtc_probe(struct platform_device *pdev)
402 pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, 401 pdata->ioaddr = devm_ioremap(&pdev->dev, res->start,
403 resource_size(res)); 402 resource_size(res));
404 403
405 clk = clk_get(&pdev->dev, "ckil"); 404 pdata->clk = clk_get(&pdev->dev, "rtc");
406 if (IS_ERR(clk)) { 405 if (IS_ERR(pdata->clk)) {
407 ret = PTR_ERR(clk); 406 dev_err(&pdev->dev, "unable to get clock!\n");
407 ret = PTR_ERR(pdata->clk);
408 goto exit_free_pdata; 408 goto exit_free_pdata;
409 } 409 }
410 410
411 rate = clk_get_rate(clk); 411 clk_enable(pdata->clk);
412 clk_put(clk); 412 rate = clk_get_rate(pdata->clk);
413 413
414 if (rate == 32768) 414 if (rate == 32768)
415 reg = RTC_INPUT_CLK_32768HZ; 415 reg = RTC_INPUT_CLK_32768HZ;
@@ -420,7 +420,7 @@ static int __init mxc_rtc_probe(struct platform_device *pdev)
420 else { 420 else {
421 dev_err(&pdev->dev, "rtc clock is not valid (%lu)\n", rate); 421 dev_err(&pdev->dev, "rtc clock is not valid (%lu)\n", rate);
422 ret = -EINVAL; 422 ret = -EINVAL;
423 goto exit_free_pdata; 423 goto exit_put_clk;
424 } 424 }
425 425
426 reg |= RTC_ENABLE_BIT; 426 reg |= RTC_ENABLE_BIT;
@@ -428,18 +428,9 @@ static int __init mxc_rtc_probe(struct platform_device *pdev)
428 if (((readw(pdata->ioaddr + RTC_RTCCTL)) & RTC_ENABLE_BIT) == 0) { 428 if (((readw(pdata->ioaddr + RTC_RTCCTL)) & RTC_ENABLE_BIT) == 0) {
429 dev_err(&pdev->dev, "hardware module can't be enabled!\n"); 429 dev_err(&pdev->dev, "hardware module can't be enabled!\n");
430 ret = -EIO; 430 ret = -EIO;
431 goto exit_free_pdata; 431 goto exit_put_clk;
432 }
433
434 pdata->clk = clk_get(&pdev->dev, "rtc");
435 if (IS_ERR(pdata->clk)) {
436 dev_err(&pdev->dev, "unable to get clock!\n");
437 ret = PTR_ERR(pdata->clk);
438 goto exit_free_pdata;
439 } 432 }
440 433
441 clk_enable(pdata->clk);
442
443 rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops, 434 rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops,
444 THIS_MODULE); 435 THIS_MODULE);
445 if (IS_ERR(rtc)) { 436 if (IS_ERR(rtc)) {
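
The reshuffle above fetches the "rtc" clock once, enables it before reading its rate, and unwinds every later failure through the new exit_put_clk label. The invariant it enforces, as a self-contained sketch (my_probe_clk() is illustrative):

#include <linux/clk.h>
#include <linux/err.h>

static int my_probe_clk(struct device *dev, unsigned long *rate)
{
        struct clk *clk = clk_get(dev, "rtc");

        if (IS_ERR(clk))
                return PTR_ERR(clk);

        clk_enable(clk);
        *rate = clk_get_rate(clk);

        if (*rate != 32768) {           /* reject unsupported rates */
                clk_disable(clk);       /* undo the enable first ... */
                clk_put(clk);           /* ... then drop the reference */
                return -EINVAL;
        }
        return 0;
}
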
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index 4969b6059c89..e5972b2c17b7 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -29,6 +29,11 @@
29#include <asm/irq.h> 29#include <asm/irq.h>
30#include <plat/regs-rtc.h> 30#include <plat/regs-rtc.h>
31 31
32enum s3c_cpu_type {
33 TYPE_S3C2410,
34 TYPE_S3C64XX,
35};
36
32/* I have yet to find an S3C implementation with more than one 37/* I have yet to find an S3C implementation with more than one
33 * of these rtc blocks in */ 38 * of these rtc blocks in */
34 39
@@ -37,6 +42,7 @@ static struct resource *s3c_rtc_mem;
37static void __iomem *s3c_rtc_base; 42static void __iomem *s3c_rtc_base;
38static int s3c_rtc_alarmno = NO_IRQ; 43static int s3c_rtc_alarmno = NO_IRQ;
39static int s3c_rtc_tickno = NO_IRQ; 44static int s3c_rtc_tickno = NO_IRQ;
45static enum s3c_cpu_type s3c_rtc_cpu_type;
40 46
41static DEFINE_SPINLOCK(s3c_rtc_pie_lock); 47static DEFINE_SPINLOCK(s3c_rtc_pie_lock);
42 48
@@ -80,12 +86,25 @@ static int s3c_rtc_setpie(struct device *dev, int enabled)
80 pr_debug("%s: pie=%d\n", __func__, enabled); 86 pr_debug("%s: pie=%d\n", __func__, enabled);
81 87
82 spin_lock_irq(&s3c_rtc_pie_lock); 88 spin_lock_irq(&s3c_rtc_pie_lock);
83 tmp = readb(s3c_rtc_base + S3C2410_TICNT) & ~S3C2410_TICNT_ENABLE;
84 89
85 if (enabled) 90 if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
86 tmp |= S3C2410_TICNT_ENABLE; 91 tmp = readb(s3c_rtc_base + S3C2410_RTCCON);
92 tmp &= ~S3C64XX_RTCCON_TICEN;
93
94 if (enabled)
95 tmp |= S3C64XX_RTCCON_TICEN;
96
97 writeb(tmp, s3c_rtc_base + S3C2410_RTCCON);
98 } else {
99 tmp = readb(s3c_rtc_base + S3C2410_TICNT);
100 tmp &= ~S3C2410_TICNT_ENABLE;
101
102 if (enabled)
103 tmp |= S3C2410_TICNT_ENABLE;
104
105 writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
106 }
87 107
88 writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
89 spin_unlock_irq(&s3c_rtc_pie_lock); 108 spin_unlock_irq(&s3c_rtc_pie_lock);
90 109
91 return 0; 110 return 0;
@@ -93,15 +112,21 @@ static int s3c_rtc_setpie(struct device *dev, int enabled)
93 112
94static int s3c_rtc_setfreq(struct device *dev, int freq) 113static int s3c_rtc_setfreq(struct device *dev, int freq)
95{ 114{
96 unsigned int tmp; 115 struct platform_device *pdev = to_platform_device(dev);
116 struct rtc_device *rtc_dev = platform_get_drvdata(pdev);
117 unsigned int tmp = 0;
97 118
98 if (!is_power_of_2(freq)) 119 if (!is_power_of_2(freq))
99 return -EINVAL; 120 return -EINVAL;
100 121
101 spin_lock_irq(&s3c_rtc_pie_lock); 122 spin_lock_irq(&s3c_rtc_pie_lock);
102 123
103 tmp = readb(s3c_rtc_base + S3C2410_TICNT) & S3C2410_TICNT_ENABLE; 124 if (s3c_rtc_cpu_type == TYPE_S3C2410) {
104 tmp |= (128 / freq)-1; 125 tmp = readb(s3c_rtc_base + S3C2410_TICNT);
126 tmp &= S3C2410_TICNT_ENABLE;
127 }
128
129 tmp |= (rtc_dev->max_user_freq / freq)-1;
105 130
106 writeb(tmp, s3c_rtc_base + S3C2410_TICNT); 131 writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
107 spin_unlock_irq(&s3c_rtc_pie_lock); 132 spin_unlock_irq(&s3c_rtc_pie_lock);
@@ -283,10 +308,17 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
283 308
284static int s3c_rtc_proc(struct device *dev, struct seq_file *seq) 309static int s3c_rtc_proc(struct device *dev, struct seq_file *seq)
285{ 310{
286 unsigned int ticnt = readb(s3c_rtc_base + S3C2410_TICNT); 311 unsigned int ticnt;
287 312
288 seq_printf(seq, "periodic_IRQ\t: %s\n", 313 if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
289 (ticnt & S3C2410_TICNT_ENABLE) ? "yes" : "no" ); 314 ticnt = readb(s3c_rtc_base + S3C2410_RTCCON);
315 ticnt &= S3C64XX_RTCCON_TICEN;
316 } else {
317 ticnt = readb(s3c_rtc_base + S3C2410_TICNT);
318 ticnt &= S3C2410_TICNT_ENABLE;
319 }
320
321 seq_printf(seq, "periodic_IRQ\t: %s\n", ticnt ? "yes" : "no");
290 return 0; 322 return 0;
291} 323}
292 324
@@ -353,10 +385,16 @@ static void s3c_rtc_enable(struct platform_device *pdev, int en)
353 385
354 if (!en) { 386 if (!en) {
355 tmp = readb(base + S3C2410_RTCCON); 387 tmp = readb(base + S3C2410_RTCCON);
356 writeb(tmp & ~S3C2410_RTCCON_RTCEN, base + S3C2410_RTCCON); 388 if (s3c_rtc_cpu_type == TYPE_S3C64XX)
357 389 tmp &= ~S3C64XX_RTCCON_TICEN;
358 tmp = readb(base + S3C2410_TICNT); 390 tmp &= ~S3C2410_RTCCON_RTCEN;
359 writeb(tmp & ~S3C2410_TICNT_ENABLE, base + S3C2410_TICNT); 391 writeb(tmp, base + S3C2410_RTCCON);
392
393 if (s3c_rtc_cpu_type == TYPE_S3C2410) {
394 tmp = readb(base + S3C2410_TICNT);
395 tmp &= ~S3C2410_TICNT_ENABLE;
396 writeb(tmp, base + S3C2410_TICNT);
397 }
360 } else { 398 } else {
361 /* re-enable the device, and check it is ok */ 399 /* re-enable the device, and check it is ok */
362 400
@@ -472,7 +510,12 @@ static int __devinit s3c_rtc_probe(struct platform_device *pdev)
472 goto err_nortc; 510 goto err_nortc;
473 } 511 }
474 512
 475 rtc->max_user_freq = 128; 513 s3c_rtc_cpu_type = platform_get_device_id(pdev)->driver_data;
 514
 515 if (s3c_rtc_cpu_type == TYPE_S3C64XX)
 516 rtc->max_user_freq = 32768;
 517 else
 518 rtc->max_user_freq = 128;
476 519
477 platform_set_drvdata(pdev, rtc); 520 platform_set_drvdata(pdev, rtc);
478 return 0; 521 return 0;
@@ -492,20 +535,30 @@ static int __devinit s3c_rtc_probe(struct platform_device *pdev)
492 535
493/* RTC Power management control */ 536/* RTC Power management control */
494 537
495static int ticnt_save; 538static int ticnt_save, ticnt_en_save;
496 539
497static int s3c_rtc_suspend(struct platform_device *pdev, pm_message_t state) 540static int s3c_rtc_suspend(struct platform_device *pdev, pm_message_t state)
498{ 541{
499 /* save TICNT for anyone using periodic interrupts */ 542 /* save TICNT for anyone using periodic interrupts */
500 ticnt_save = readb(s3c_rtc_base + S3C2410_TICNT); 543 ticnt_save = readb(s3c_rtc_base + S3C2410_TICNT);
544 if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
545 ticnt_en_save = readb(s3c_rtc_base + S3C2410_RTCCON);
546 ticnt_en_save &= S3C64XX_RTCCON_TICEN;
547 }
501 s3c_rtc_enable(pdev, 0); 548 s3c_rtc_enable(pdev, 0);
502 return 0; 549 return 0;
503} 550}
504 551
505static int s3c_rtc_resume(struct platform_device *pdev) 552static int s3c_rtc_resume(struct platform_device *pdev)
506{ 553{
554 unsigned int tmp;
555
507 s3c_rtc_enable(pdev, 1); 556 s3c_rtc_enable(pdev, 1);
508 writeb(ticnt_save, s3c_rtc_base + S3C2410_TICNT); 557 writeb(ticnt_save, s3c_rtc_base + S3C2410_TICNT);
558 if (s3c_rtc_cpu_type == TYPE_S3C64XX && ticnt_en_save) {
559 tmp = readb(s3c_rtc_base + S3C2410_RTCCON);
560 writeb(tmp | ticnt_en_save, s3c_rtc_base + S3C2410_RTCCON);
561 }
509 return 0; 562 return 0;
510} 563}
511#else 564#else
@@ -513,13 +566,27 @@ static int s3c_rtc_resume(struct platform_device *pdev)
513#define s3c_rtc_resume NULL 566#define s3c_rtc_resume NULL
514#endif 567#endif
515 568
516static struct platform_driver s3c2410_rtc_driver = { 569static struct platform_device_id s3c_rtc_driver_ids[] = {
570 {
571 .name = "s3c2410-rtc",
572 .driver_data = TYPE_S3C2410,
573 }, {
574 .name = "s3c64xx-rtc",
575 .driver_data = TYPE_S3C64XX,
576 },
577 { }
578};
579
580MODULE_DEVICE_TABLE(platform, s3c_rtc_driver_ids);
581
582static struct platform_driver s3c_rtc_driver = {
517 .probe = s3c_rtc_probe, 583 .probe = s3c_rtc_probe,
518 .remove = __devexit_p(s3c_rtc_remove), 584 .remove = __devexit_p(s3c_rtc_remove),
519 .suspend = s3c_rtc_suspend, 585 .suspend = s3c_rtc_suspend,
520 .resume = s3c_rtc_resume, 586 .resume = s3c_rtc_resume,
587 .id_table = s3c_rtc_driver_ids,
521 .driver = { 588 .driver = {
522 .name = "s3c2410-rtc", 589 .name = "s3c-rtc",
523 .owner = THIS_MODULE, 590 .owner = THIS_MODULE,
524 }, 591 },
525}; 592};
@@ -529,12 +596,12 @@ static char __initdata banner[] = "S3C24XX RTC, (c) 2004,2006 Simtec Electronics
529static int __init s3c_rtc_init(void) 596static int __init s3c_rtc_init(void)
530{ 597{
531 printk(banner); 598 printk(banner);
532 return platform_driver_register(&s3c2410_rtc_driver); 599 return platform_driver_register(&s3c_rtc_driver);
533} 600}
534 601
535static void __exit s3c_rtc_exit(void) 602static void __exit s3c_rtc_exit(void)
536{ 603{
537 platform_driver_unregister(&s3c2410_rtc_driver); 604 platform_driver_unregister(&s3c_rtc_driver);
538} 605}
539 606
540module_init(s3c_rtc_init); 607module_init(s3c_rtc_init);
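
Since the driver now matches through an id_table, one module serves both SoC families and a machine file selects the variant purely by platform-device name. Sketch, using the standard platform-device API (my_rtc_device and my_board_init() are illustrative):

#include <linux/platform_device.h>

static struct platform_device my_rtc_device = {
        .name   = "s3c64xx-rtc",  /* selects TYPE_S3C64XX via the id_table */
        .id     = -1,
};

static int __init my_board_init(void)
{
        return platform_device_register(&my_rtc_device);
}
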
diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c
index b16cfe57a484..82931dc65c0b 100644
--- a/drivers/rtc/rtc-wm831x.c
+++ b/drivers/rtc/rtc-wm831x.c
@@ -449,17 +449,17 @@ static int wm831x_rtc_probe(struct platform_device *pdev)
449 goto err; 449 goto err;
450 } 450 }
451 451
452 ret = wm831x_request_irq(wm831x, per_irq, wm831x_per_irq, 452 ret = request_threaded_irq(per_irq, NULL, wm831x_per_irq,
453 IRQF_TRIGGER_RISING, "wm831x_rtc_per", 453 IRQF_TRIGGER_RISING, "RTC period",
454 wm831x_rtc); 454 wm831x_rtc);
455 if (ret != 0) { 455 if (ret != 0) {
456 dev_err(&pdev->dev, "Failed to request periodic IRQ %d: %d\n", 456 dev_err(&pdev->dev, "Failed to request periodic IRQ %d: %d\n",
457 per_irq, ret); 457 per_irq, ret);
458 } 458 }
459 459
460 ret = wm831x_request_irq(wm831x, alm_irq, wm831x_alm_irq, 460 ret = request_threaded_irq(alm_irq, NULL, wm831x_alm_irq,
461 IRQF_TRIGGER_RISING, "wm831x_rtc_alm", 461 IRQF_TRIGGER_RISING, "RTC alarm",
462 wm831x_rtc); 462 wm831x_rtc);
463 if (ret != 0) { 463 if (ret != 0) {
464 dev_err(&pdev->dev, "Failed to request alarm IRQ %d: %d\n", 464 dev_err(&pdev->dev, "Failed to request alarm IRQ %d: %d\n",
465 alm_irq, ret); 465 alm_irq, ret);
@@ -478,8 +478,8 @@ static int __devexit wm831x_rtc_remove(struct platform_device *pdev)
478 int per_irq = platform_get_irq_byname(pdev, "PER"); 478 int per_irq = platform_get_irq_byname(pdev, "PER");
479 int alm_irq = platform_get_irq_byname(pdev, "ALM"); 479 int alm_irq = platform_get_irq_byname(pdev, "ALM");
480 480
481 wm831x_free_irq(wm831x_rtc->wm831x, alm_irq, wm831x_rtc); 481 free_irq(alm_irq, wm831x_rtc);
482 wm831x_free_irq(wm831x_rtc->wm831x, per_irq, wm831x_rtc); 482 free_irq(per_irq, wm831x_rtc);
483 rtc_device_unregister(wm831x_rtc->rtc); 483 rtc_device_unregister(wm831x_rtc->rtc);
484 kfree(wm831x_rtc); 484 kfree(wm831x_rtc);
485 485
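
Replacing the wm831x-local IRQ wrappers with genirq also makes the handlers threaded: passing NULL as the primary handler asks the core to install its default hard handler and run the supplied function in process context, where it may sleep. The general shape (my_thread_fn() and my_setup_irq() are illustrative):

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *data)
{
        /* process context: sleeping bus I/O (I2C/SPI) is allowed here */
        return IRQ_HANDLED;
}

static int my_setup_irq(int irq, void *dev_data)
{
        return request_threaded_irq(irq, NULL, my_thread_fn,
                                    IRQF_TRIGGER_RISING, "my device",
                                    dev_data);
}
/* tear-down pairs with free_irq(irq, dev_data), as in the remove hook */
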
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 9276121db1ef..44a07593de56 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -688,7 +688,7 @@ static int fcoe_shost_config(struct fc_lport *lport, struct device *dev)
688 } 688 }
689 689
690 if (!lport->vport) 690 if (!lport->vport)
691 fc_host_max_npiv_vports(lport->host) = USHORT_MAX; 691 fc_host_max_npiv_vports(lport->host) = USHRT_MAX;
692 692
693 snprintf(fc_host_symbolic_name(lport->host), FC_SYMBOLIC_NAME_SIZE, 693 snprintf(fc_host_symbolic_name(lport->host), FC_SYMBOLIC_NAME_SIZE,
694 "%s v%s over %s", FCOE_NAME, FCOE_VERSION, 694 "%s v%s over %s", FCOE_NAME, FCOE_VERSION,
diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.c b/drivers/scsi/mpt2sas/mpt2sas_base.c
index b830d61684dd..0ec1ed389c20 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_base.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_base.c
@@ -3757,7 +3757,7 @@ _base_reset_handler(struct MPT2SAS_ADAPTER *ioc, int reset_phase)
3757 if (ioc->config_cmds.status & MPT2_CMD_PENDING) { 3757 if (ioc->config_cmds.status & MPT2_CMD_PENDING) {
3758 ioc->config_cmds.status |= MPT2_CMD_RESET; 3758 ioc->config_cmds.status |= MPT2_CMD_RESET;
3759 mpt2sas_base_free_smid(ioc, ioc->config_cmds.smid); 3759 mpt2sas_base_free_smid(ioc, ioc->config_cmds.smid);
3760 ioc->config_cmds.smid = USHORT_MAX; 3760 ioc->config_cmds.smid = USHRT_MAX;
3761 complete(&ioc->config_cmds.done); 3761 complete(&ioc->config_cmds.done);
3762 } 3762 }
3763 break; 3763 break;
diff --git a/drivers/scsi/mpt2sas/mpt2sas_config.c b/drivers/scsi/mpt2sas/mpt2sas_config.c
index e762dd3e2fcb..c65442982d7b 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_config.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_config.c
@@ -258,7 +258,7 @@ mpt2sas_config_done(struct MPT2SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
258#ifdef CONFIG_SCSI_MPT2SAS_LOGGING 258#ifdef CONFIG_SCSI_MPT2SAS_LOGGING
259 _config_display_some_debug(ioc, smid, "config_done", mpi_reply); 259 _config_display_some_debug(ioc, smid, "config_done", mpi_reply);
260#endif 260#endif
261 ioc->config_cmds.smid = USHORT_MAX; 261 ioc->config_cmds.smid = USHRT_MAX;
262 complete(&ioc->config_cmds.done); 262 complete(&ioc->config_cmds.done);
263 return 1; 263 return 1;
264} 264}
diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c
index 78ed24bb6a35..30463862603b 100644
--- a/drivers/serial/68328serial.c
+++ b/drivers/serial/68328serial.c
@@ -1437,7 +1437,7 @@ int m68328_console_setup(struct console *cp, char *arg)
1437 for (i = 0; i < ARRAY_SIZE(baud_table); i++) 1437 for (i = 0; i < ARRAY_SIZE(baud_table); i++)
1438 if (baud_table[i] == n) 1438 if (baud_table[i] == n)
1439 break; 1439 break;
1440 if (i < BAUD_TABLE_SIZE) { 1440 if (i < ARRAY_SIZE(baud_table)) {
1441 m68328_console_baud = n; 1441 m68328_console_baud = n;
1442 m68328_console_cbaud = 0; 1442 m68328_console_cbaud = 0;
1443 if (i > 15) { 1443 if (i > 15) {
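
The 68328serial fix replaces a stale BAUD_TABLE_SIZE constant with ARRAY_SIZE(), so the post-loop bounds check can never drift out of sync with the table it guards. The idiom in isolation, with illustrative table contents:

    #include <linux/kernel.h>    /* ARRAY_SIZE() */

    static const int baud_table[] = { 9600, 19200, 38400, 57600, 115200 };

    static int baud_to_index(int n)
    {
        int i;

        for (i = 0; i < ARRAY_SIZE(baud_table); i++)
            if (baud_table[i] == n)
                return i;
        return -1;    /* i == ARRAY_SIZE(baud_table): not found */
    }
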
diff --git a/drivers/staging/rt2860/common/rtmp_init.c b/drivers/staging/rt2860/common/rtmp_init.c
index 21a95ffdfb86..a09038542f26 100644
--- a/drivers/staging/rt2860/common/rtmp_init.c
+++ b/drivers/staging/rt2860/common/rtmp_init.c
@@ -2810,17 +2810,6 @@ void UserCfgInit(struct rt_rtmp_adapter *pAd)
2810} 2810}
2811 2811
2812/* IRQL = PASSIVE_LEVEL */ 2812/* IRQL = PASSIVE_LEVEL */
2813u8 BtoH(char ch)
2814{
2815 if (ch >= '0' && ch <= '9')
2816 return (ch - '0'); /* Handle numerals */
2817 if (ch >= 'A' && ch <= 'F')
2818 return (ch - 'A' + 0xA); /* Handle capital hex digits */
2819 if (ch >= 'a' && ch <= 'f')
2820 return (ch - 'a' + 0xA); /* Handle small hex digits */
2821 return (255);
2822}
2823
2824/* */ 2813/* */
2825/* FUNCTION: AtoH(char *, u8 *, int) */ 2814/* FUNCTION: AtoH(char *, u8 *, int) */
2826/* */ 2815/* */
@@ -2847,8 +2836,8 @@ void AtoH(char *src, u8 *dest, int destlen)
2847 destTemp = (u8 *)dest; 2836 destTemp = (u8 *)dest;
2848 2837
2849 while (destlen--) { 2838 while (destlen--) {
2850 *destTemp = BtoH(*srcptr++) << 4; /* Put 1st ascii byte in upper nibble. */ 2839 *destTemp = hex_to_bin(*srcptr++) << 4; /* Put 1st ascii byte in upper nibble. */
2851 *destTemp += BtoH(*srcptr++); /* Add 2nd ascii byte to above. */ 2840 *destTemp += hex_to_bin(*srcptr++); /* Add 2nd ascii byte to above. */
2852 destTemp++; 2841 destTemp++;
2853 } 2842 }
2854} 2843}
diff --git a/drivers/staging/rt2860/rtmp.h b/drivers/staging/rt2860/rtmp.h
index ab525ee15042..82b6e783b33f 100644
--- a/drivers/staging/rt2860/rtmp.h
+++ b/drivers/staging/rt2860/rtmp.h
@@ -2356,8 +2356,6 @@ void RTMPMoveMemory(void *pDest, void *pSrc, unsigned long Length);
2356 2356
2357void AtoH(char *src, u8 *dest, int destlen); 2357void AtoH(char *src, u8 *dest, int destlen);
2358 2358
2359u8 BtoH(char ch);
2360
2361void RTMPPatchMacBbpBug(struct rt_rtmp_adapter *pAd); 2359void RTMPPatchMacBbpBug(struct rt_rtmp_adapter *pAd);
2362 2360
2363void RTMPInitTimer(struct rt_rtmp_adapter *pAd, 2361void RTMPInitTimer(struct rt_rtmp_adapter *pAd,
diff --git a/drivers/usb/atm/speedtch.c b/drivers/usb/atm/speedtch.c
index 1e9ba4bdffef..1335456b4f93 100644
--- a/drivers/usb/atm/speedtch.c
+++ b/drivers/usb/atm/speedtch.c
@@ -127,8 +127,6 @@ MODULE_PARM_DESC(ModemOption, "default: 0x10,0x00,0x00,0x00,0x20");
127#define ENDPOINT_ISOC_DATA 0x07 127#define ENDPOINT_ISOC_DATA 0x07
128#define ENDPOINT_FIRMWARE 0x05 128#define ENDPOINT_FIRMWARE 0x05
129 129
130#define hex2int(c) ( (c >= '0') && (c <= '9') ? (c - '0') : ((c & 0xf) + 9) )
131
132struct speedtch_params { 130struct speedtch_params {
133 unsigned int altsetting; 131 unsigned int altsetting;
134 unsigned int BMaxDSL; 132 unsigned int BMaxDSL;
@@ -669,7 +667,8 @@ static int speedtch_atm_start(struct usbatm_data *usbatm, struct atm_dev *atm_de
669 memset(atm_dev->esi, 0, sizeof(atm_dev->esi)); 667 memset(atm_dev->esi, 0, sizeof(atm_dev->esi));
670 if (usb_string(usb_dev, usb_dev->descriptor.iSerialNumber, mac_str, sizeof(mac_str)) == 12) { 668 if (usb_string(usb_dev, usb_dev->descriptor.iSerialNumber, mac_str, sizeof(mac_str)) == 12) {
671 for (i = 0; i < 6; i++) 669 for (i = 0; i < 6; i++)
672 atm_dev->esi[i] = (hex2int(mac_str[i * 2]) * 16) + (hex2int(mac_str[i * 2 + 1])); 670 atm_dev->esi[i] = (hex_to_bin(mac_str[i * 2]) << 4) +
671 hex_to_bin(mac_str[i * 2 + 1]);
673 } 672 }
674 673
675 /* Start modem synchronisation */ 674 /* Start modem synchronisation */
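
Both the rt2860 and speedtch hunks above retire private ASCII-hex helpers (BtoH(), hex2int()) in favour of the library function hex_to_bin(), which maps one hex digit to its value and returns -1 on invalid input. A sketch of decoding a 12-character MAC string into 6 bytes this way; the function and buffer names are illustrative:

    #include <linux/errno.h>
    #include <linux/kernel.h>    /* hex_to_bin() */

    /* Decode e.g. "0050C2AABBCC"; returns 0 or -EINVAL. */
    static int mac_str_to_bytes(const char *mac_str, u8 esi[6])
    {
        int i, hi, lo;

        for (i = 0; i < 6; i++) {
            hi = hex_to_bin(mac_str[i * 2]);
            lo = hex_to_bin(mac_str[i * 2 + 1]);
            if (hi < 0 || lo < 0)
                return -EINVAL;
            esi[i] = (hi << 4) | lo;
        }
        return 0;
    }

Unlike the old macros, the negative return value lets callers reject malformed strings instead of silently producing garbage.
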
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 750effe0f98b..c6fb8e968f21 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -806,7 +806,7 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
806 count = indirect->len / sizeof desc; 806 count = indirect->len / sizeof desc;
807 /* Buffers are chained via a 16 bit next field, so 807 /* Buffers are chained via a 16 bit next field, so
808 * we can have at most 2^16 of these. */ 808 * we can have at most 2^16 of these. */
809 if (count > USHORT_MAX + 1) { 809 if (count > USHRT_MAX + 1) {
810 vq_err(vq, "Indirect buffer length too big: %d\n", 810 vq_err(vq, "Indirect buffer length too big: %d\n",
811 indirect->len); 811 indirect->len);
812 return -E2BIG; 812 return -E2BIG;
diff --git a/drivers/video/arcfb.c b/drivers/video/arcfb.c
index 8d406fb689c1..f3d7440f0072 100644
--- a/drivers/video/arcfb.c
+++ b/drivers/video/arcfb.c
@@ -80,7 +80,7 @@ struct arcfb_par {
80 spinlock_t lock; 80 spinlock_t lock;
81}; 81};
82 82
83static struct fb_fix_screeninfo arcfb_fix __initdata = { 83static struct fb_fix_screeninfo arcfb_fix __devinitdata = {
84 .id = "arcfb", 84 .id = "arcfb",
85 .type = FB_TYPE_PACKED_PIXELS, 85 .type = FB_TYPE_PACKED_PIXELS,
86 .visual = FB_VISUAL_MONO01, 86 .visual = FB_VISUAL_MONO01,
@@ -90,7 +90,7 @@ static struct fb_fix_screeninfo arcfb_fix __initdata = {
90 .accel = FB_ACCEL_NONE, 90 .accel = FB_ACCEL_NONE,
91}; 91};
92 92
93static struct fb_var_screeninfo arcfb_var __initdata = { 93static struct fb_var_screeninfo arcfb_var __devinitdata = {
94 .xres = 128, 94 .xres = 128,
95 .yres = 64, 95 .yres = 64,
96 .xres_virtual = 128, 96 .xres_virtual = 128,
@@ -588,7 +588,7 @@ err:
588 return retval; 588 return retval;
589} 589}
590 590
591static int arcfb_remove(struct platform_device *dev) 591static int __devexit arcfb_remove(struct platform_device *dev)
592{ 592{
593 struct fb_info *info = platform_get_drvdata(dev); 593 struct fb_info *info = platform_get_drvdata(dev);
594 594
@@ -602,7 +602,7 @@ static int arcfb_remove(struct platform_device *dev)
602 602
603static struct platform_driver arcfb_driver = { 603static struct platform_driver arcfb_driver = {
604 .probe = arcfb_probe, 604 .probe = arcfb_probe,
605 .remove = arcfb_remove, 605 .remove = __devexit_p(arcfb_remove),
606 .driver = { 606 .driver = {
607 .name = "arcfb", 607 .name = "arcfb",
608 }, 608 },
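
arcfb, like several framebuffer drivers later in this series, gains __devinitdata/__devexit annotations so probe-time data and remove-time code can be discarded when hotplug support is configured out; __devexit_p() then evaluates to NULL so the ops struct still compiles. The pattern in miniature, under a hypothetical driver name:

    #include <linux/init.h>
    #include <linux/platform_device.h>

    static int __devinit examplefb_probe(struct platform_device *pdev)
    {
        return 0;    /* may be discarded after boot if !CONFIG_HOTPLUG */
    }

    static int __devexit examplefb_remove(struct platform_device *pdev)
    {
        return 0;    /* dropped entirely when it can never be called */
    }

    static struct platform_driver examplefb_driver = {
        .probe  = examplefb_probe,
        /* NULL when __devexit code is discarded, so no dangling pointer */
        .remove = __devexit_p(examplefb_remove),
        .driver = {
            .name = "examplefb",
        },
    };
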
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index 29d72851f85b..f8d69ad36830 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -1820,10 +1820,6 @@ struct atyclk {
1820#define ATYIO_FEATW 0x41545903 /* ATY\03 */ 1820#define ATYIO_FEATW 0x41545903 /* ATY\03 */
1821#endif 1821#endif
1822 1822
1823#ifndef FBIO_WAITFORVSYNC
1824#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
1825#endif
1826
1827static int atyfb_ioctl(struct fb_info *info, u_int cmd, u_long arg) 1823static int atyfb_ioctl(struct fb_info *info, u_int cmd, u_long arg)
1828{ 1824{
1829 struct atyfb_par *par = (struct atyfb_par *) info->par; 1825 struct atyfb_par *par = (struct atyfb_par *) info->par;
diff --git a/drivers/video/bfin-lq035q1-fb.c b/drivers/video/bfin-lq035q1-fb.c
index 2baac7cc1425..c8e1f04941bd 100644
--- a/drivers/video/bfin-lq035q1-fb.c
+++ b/drivers/video/bfin-lq035q1-fb.c
@@ -61,47 +61,13 @@
61#define LCD_X_RES 320 /* Horizontal Resolution */ 61#define LCD_X_RES 320 /* Horizontal Resolution */
62#define LCD_Y_RES 240 /* Vertical Resolution */ 62#define LCD_Y_RES 240 /* Vertical Resolution */
63#define DMA_BUS_SIZE 16 63#define DMA_BUS_SIZE 16
64#define U_LINE 4 /* Blanking Lines */
64 65
65#define USE_RGB565_16_BIT_PPI
66
67#ifdef USE_RGB565_16_BIT_PPI
68#define LCD_BPP 16 /* Bit Per Pixel */
69#define CLOCKS_PER_PIX 1
70#define CPLD_PIPELINE_DELAY_COR 0 /* NO CPLB */
71#endif
72 66
73/* Interface 16/18-bit TFT over an 8-bit wide PPI using a small Programmable Logic Device (CPLD) 67/* Interface 16/18-bit TFT over an 8-bit wide PPI using a small Programmable Logic Device (CPLD)
74 * http://blackfin.uclinux.org/gf/project/stamp/frs/?action=FrsReleaseBrowse&frs_package_id=165 68 * http://blackfin.uclinux.org/gf/project/stamp/frs/?action=FrsReleaseBrowse&frs_package_id=165
75 */ 69 */
76 70
77#ifdef USE_RGB565_8_BIT_PPI
78#define LCD_BPP 16 /* Bit Per Pixel */
79#define CLOCKS_PER_PIX 2
80#define CPLD_PIPELINE_DELAY_COR 3 /* RGB565 */
81#endif
82
83#ifdef USE_RGB888_8_BIT_PPI
84#define LCD_BPP 24 /* Bit Per Pixel */
85#define CLOCKS_PER_PIX 3
86#define CPLD_PIPELINE_DELAY_COR 5 /* RGB888 */
87#endif
88
89 /*
90 * HS and VS timing parameters (all in number of PPI clk ticks)
91 */
92
93#define U_LINE 4 /* Blanking Lines */
94
95#define H_ACTPIX (LCD_X_RES * CLOCKS_PER_PIX) /* active horizontal pixel */
96#define H_PERIOD (336 * CLOCKS_PER_PIX) /* HS period */
97#define H_PULSE (2 * CLOCKS_PER_PIX) /* HS pulse width */
98#define H_START (7 * CLOCKS_PER_PIX + CPLD_PIPELINE_DELAY_COR) /* first valid pixel */
99
100#define V_LINES (LCD_Y_RES + U_LINE) /* total vertical lines */
101#define V_PULSE (2 * CLOCKS_PER_PIX) /* VS pulse width (1-5 H_PERIODs) */
102#define V_PERIOD (H_PERIOD * V_LINES) /* VS period */
103
104#define ACTIVE_VIDEO_MEM_OFFSET ((U_LINE / 2) * LCD_X_RES * (LCD_BPP / 8))
105 71
106#define BFIN_LCD_NBR_PALETTE_ENTRIES 256 72#define BFIN_LCD_NBR_PALETTE_ENTRIES 256
107 73
@@ -110,12 +76,6 @@
110#define PPI_PORT_CFG_01 0x10 76#define PPI_PORT_CFG_01 0x10
111#define PPI_POLS_1 0x8000 77#define PPI_POLS_1 0x8000
112 78
113#if (CLOCKS_PER_PIX > 1)
114#define PPI_PMODE (DLEN_8 | PACK_EN)
115#else
116#define PPI_PMODE (DLEN_16)
117#endif
118
119#define LQ035_INDEX 0x74 79#define LQ035_INDEX 0x74
120#define LQ035_DATA 0x76 80#define LQ035_DATA 0x76
121 81
@@ -139,6 +99,15 @@ struct bfin_lq035q1fb_info {
139 int irq; 99 int irq;
140 spinlock_t lock; /* lock */ 100 spinlock_t lock; /* lock */
141 u32 pseudo_pal[16]; 101 u32 pseudo_pal[16];
102
103 u32 lcd_bpp;
104 u32 h_actpix;
105 u32 h_period;
106 u32 h_pulse;
107 u32 h_start;
108 u32 v_lines;
109 u32 v_pulse;
110 u32 v_period;
142}; 111};
143 112
144static int nocursor; 113static int nocursor;
@@ -234,16 +203,69 @@ static int lq035q1_backlight(struct bfin_lq035q1fb_info *info, unsigned arg)
234 return 0; 203 return 0;
235} 204}
236 205
206static int bfin_lq035q1_calc_timing(struct bfin_lq035q1fb_info *fbi)
207{
208 unsigned long clocks_per_pix, cpld_pipeline_delay_cor;
209
210 /*
211 * Interface 16/18-bit TFT over an 8-bit wide PPI using a small
212 * Programmable Logic Device (CPLD)
213 * http://blackfin.uclinux.org/gf/project/stamp/frs/?action=FrsReleaseBrowse&frs_package_id=165
214 */
215
216 switch (fbi->disp_info->ppi_mode) {
217 case USE_RGB565_16_BIT_PPI:
218 fbi->lcd_bpp = 16;
219 clocks_per_pix = 1;
220 cpld_pipeline_delay_cor = 0;
221 break;
222 case USE_RGB565_8_BIT_PPI:
223 fbi->lcd_bpp = 16;
224 clocks_per_pix = 2;
225 cpld_pipeline_delay_cor = 3;
226 break;
227 case USE_RGB888_8_BIT_PPI:
228 fbi->lcd_bpp = 24;
229 clocks_per_pix = 3;
230 cpld_pipeline_delay_cor = 5;
231 break;
232 default:
233 return -EINVAL;
234 }
235
236 /*
237 * HS and VS timing parameters (all in number of PPI clk ticks)
238 */
239
240 fbi->h_actpix = (LCD_X_RES * clocks_per_pix); /* active horizontal pixel */
241 fbi->h_period = (336 * clocks_per_pix); /* HS period */
242 fbi->h_pulse = (2 * clocks_per_pix); /* HS pulse width */
243 fbi->h_start = (7 * clocks_per_pix + cpld_pipeline_delay_cor); /* first valid pixel */
244
245 fbi->v_lines = (LCD_Y_RES + U_LINE); /* total vertical lines */
246 fbi->v_pulse = (2 * clocks_per_pix); /* VS pulse width (1-5 H_PERIODs) */
247 fbi->v_period = (fbi->h_period * fbi->v_lines); /* VS period */
248
249 return 0;
250}
251
237static void bfin_lq035q1_config_ppi(struct bfin_lq035q1fb_info *fbi) 252static void bfin_lq035q1_config_ppi(struct bfin_lq035q1fb_info *fbi)
238{ 253{
239 bfin_write_PPI_DELAY(H_START); 254 unsigned ppi_pmode;
240 bfin_write_PPI_COUNT(H_ACTPIX - 1); 255
241 bfin_write_PPI_FRAME(V_LINES); 256 if (fbi->disp_info->ppi_mode == USE_RGB565_16_BIT_PPI)
257 ppi_pmode = DLEN_16;
258 else
259 ppi_pmode = (DLEN_8 | PACK_EN);
260
261 bfin_write_PPI_DELAY(fbi->h_start);
262 bfin_write_PPI_COUNT(fbi->h_actpix - 1);
263 bfin_write_PPI_FRAME(fbi->v_lines);
242 264
243 bfin_write_PPI_CONTROL(PPI_TX_MODE | /* output mode , PORT_DIR */ 265 bfin_write_PPI_CONTROL(PPI_TX_MODE | /* output mode , PORT_DIR */
244 PPI_XFER_TYPE_11 | /* sync mode XFR_TYPE */ 266 PPI_XFER_TYPE_11 | /* sync mode XFR_TYPE */
245 PPI_PORT_CFG_01 | /* two frame sync PORT_CFG */ 267 PPI_PORT_CFG_01 | /* two frame sync PORT_CFG */
246 PPI_PMODE | /* 8/16 bit data length / PACK_EN? */ 268 ppi_pmode | /* 8/16 bit data length / PACK_EN? */
247 PPI_POLS_1); /* falling edge syncs POLS */ 269 PPI_POLS_1); /* falling edge syncs POLS */
248} 270}
249 271
@@ -272,19 +294,19 @@ static void bfin_lq035q1_stop_timers(void)
272 294
273} 295}
274 296
275static void bfin_lq035q1_init_timers(void) 297static void bfin_lq035q1_init_timers(struct bfin_lq035q1fb_info *fbi)
276{ 298{
277 299
278 bfin_lq035q1_stop_timers(); 300 bfin_lq035q1_stop_timers();
279 301
280 set_gptimer_period(TIMER_HSYNC_id, H_PERIOD); 302 set_gptimer_period(TIMER_HSYNC_id, fbi->h_period);
281 set_gptimer_pwidth(TIMER_HSYNC_id, H_PULSE); 303 set_gptimer_pwidth(TIMER_HSYNC_id, fbi->h_pulse);
282 set_gptimer_config(TIMER_HSYNC_id, TIMER_MODE_PWM | TIMER_PERIOD_CNT | 304 set_gptimer_config(TIMER_HSYNC_id, TIMER_MODE_PWM | TIMER_PERIOD_CNT |
283 TIMER_TIN_SEL | TIMER_CLK_SEL| 305 TIMER_TIN_SEL | TIMER_CLK_SEL|
284 TIMER_EMU_RUN); 306 TIMER_EMU_RUN);
285 307
286 set_gptimer_period(TIMER_VSYNC_id, V_PERIOD); 308 set_gptimer_period(TIMER_VSYNC_id, fbi->v_period);
287 set_gptimer_pwidth(TIMER_VSYNC_id, V_PULSE); 309 set_gptimer_pwidth(TIMER_VSYNC_id, fbi->v_pulse);
288 set_gptimer_config(TIMER_VSYNC_id, TIMER_MODE_PWM | TIMER_PERIOD_CNT | 310 set_gptimer_config(TIMER_VSYNC_id, TIMER_MODE_PWM | TIMER_PERIOD_CNT |
289 TIMER_TIN_SEL | TIMER_CLK_SEL | 311 TIMER_TIN_SEL | TIMER_CLK_SEL |
290 TIMER_EMU_RUN); 312 TIMER_EMU_RUN);
@@ -294,21 +316,21 @@ static void bfin_lq035q1_init_timers(void)
294static void bfin_lq035q1_config_dma(struct bfin_lq035q1fb_info *fbi) 316static void bfin_lq035q1_config_dma(struct bfin_lq035q1fb_info *fbi)
295{ 317{
296 318
319
297 set_dma_config(CH_PPI, 320 set_dma_config(CH_PPI,
298 set_bfin_dma_config(DIR_READ, DMA_FLOW_AUTO, 321 set_bfin_dma_config(DIR_READ, DMA_FLOW_AUTO,
299 INTR_DISABLE, DIMENSION_2D, 322 INTR_DISABLE, DIMENSION_2D,
300 DATA_SIZE_16, 323 DATA_SIZE_16,
301 DMA_NOSYNC_KEEP_DMA_BUF)); 324 DMA_NOSYNC_KEEP_DMA_BUF));
302 set_dma_x_count(CH_PPI, (LCD_X_RES * LCD_BPP) / DMA_BUS_SIZE); 325 set_dma_x_count(CH_PPI, (LCD_X_RES * fbi->lcd_bpp) / DMA_BUS_SIZE);
303 set_dma_x_modify(CH_PPI, DMA_BUS_SIZE / 8); 326 set_dma_x_modify(CH_PPI, DMA_BUS_SIZE / 8);
304 set_dma_y_count(CH_PPI, V_LINES); 327 set_dma_y_count(CH_PPI, fbi->v_lines);
305 328
306 set_dma_y_modify(CH_PPI, DMA_BUS_SIZE / 8); 329 set_dma_y_modify(CH_PPI, DMA_BUS_SIZE / 8);
307 set_dma_start_addr(CH_PPI, (unsigned long)fbi->fb_buffer); 330 set_dma_start_addr(CH_PPI, (unsigned long)fbi->fb_buffer);
308 331
309} 332}
310 333
311#if (CLOCKS_PER_PIX == 1)
312static const u16 ppi0_req_16[] = {P_PPI0_CLK, P_PPI0_FS1, P_PPI0_FS2, 334static const u16 ppi0_req_16[] = {P_PPI0_CLK, P_PPI0_FS1, P_PPI0_FS2,
313 P_PPI0_D0, P_PPI0_D1, P_PPI0_D2, 335 P_PPI0_D0, P_PPI0_D1, P_PPI0_D2,
314 P_PPI0_D3, P_PPI0_D4, P_PPI0_D5, 336 P_PPI0_D3, P_PPI0_D4, P_PPI0_D5,
@@ -316,22 +338,27 @@ static const u16 ppi0_req_16[] = {P_PPI0_CLK, P_PPI0_FS1, P_PPI0_FS2,
316 P_PPI0_D9, P_PPI0_D10, P_PPI0_D11, 338 P_PPI0_D9, P_PPI0_D10, P_PPI0_D11,
317 P_PPI0_D12, P_PPI0_D13, P_PPI0_D14, 339 P_PPI0_D12, P_PPI0_D13, P_PPI0_D14,
318 P_PPI0_D15, 0}; 340 P_PPI0_D15, 0};
319#else 341
320static const u16 ppi0_req_16[] = {P_PPI0_CLK, P_PPI0_FS1, P_PPI0_FS2, 342static const u16 ppi0_req_8[] = {P_PPI0_CLK, P_PPI0_FS1, P_PPI0_FS2,
321 P_PPI0_D0, P_PPI0_D1, P_PPI0_D2, 343 P_PPI0_D0, P_PPI0_D1, P_PPI0_D2,
322 P_PPI0_D3, P_PPI0_D4, P_PPI0_D5, 344 P_PPI0_D3, P_PPI0_D4, P_PPI0_D5,
323 P_PPI0_D6, P_PPI0_D7, 0}; 345 P_PPI0_D6, P_PPI0_D7, 0};
324#endif
325 346
326static inline void bfin_lq035q1_free_ports(void) 347static inline void bfin_lq035q1_free_ports(unsigned ppi16)
327{ 348{
328 peripheral_free_list(ppi0_req_16); 349 if (ppi16)
350 peripheral_free_list(ppi0_req_16);
351 else
352 peripheral_free_list(ppi0_req_8);
353
329 if (ANOMALY_05000400) 354 if (ANOMALY_05000400)
330 gpio_free(P_IDENT(P_PPI0_FS3)); 355 gpio_free(P_IDENT(P_PPI0_FS3));
331} 356}
332 357
333static int __devinit bfin_lq035q1_request_ports(struct platform_device *pdev) 358static int __devinit bfin_lq035q1_request_ports(struct platform_device *pdev,
359 unsigned ppi16)
334{ 360{
361 int ret;
335 /* ANOMALY_05000400 - PPI Does Not Start Properly In Specific Mode: 362 /* ANOMALY_05000400 - PPI Does Not Start Properly In Specific Mode:
336 * Drive PPI_FS3 Low 363 * Drive PPI_FS3 Low
337 */ 364 */
@@ -342,7 +369,12 @@ static int __devinit bfin_lq035q1_request_ports(struct platform_device *pdev)
342 gpio_direction_output(P_IDENT(P_PPI0_FS3), 0); 369 gpio_direction_output(P_IDENT(P_PPI0_FS3), 0);
343 } 370 }
344 371
345 if (peripheral_request_list(ppi0_req_16, DRIVER_NAME)) { 372 if (ppi16)
373 ret = peripheral_request_list(ppi0_req_16, DRIVER_NAME);
374 else
375 ret = peripheral_request_list(ppi0_req_8, DRIVER_NAME);
376
377 if (ret) {
346 dev_err(&pdev->dev, "requesting peripherals failed\n"); 378 dev_err(&pdev->dev, "requesting peripherals failed\n");
347 return -EFAULT; 379 return -EFAULT;
348 } 380 }
@@ -364,7 +396,7 @@ static int bfin_lq035q1_fb_open(struct fb_info *info, int user)
364 396
365 bfin_lq035q1_config_dma(fbi); 397 bfin_lq035q1_config_dma(fbi);
366 bfin_lq035q1_config_ppi(fbi); 398 bfin_lq035q1_config_ppi(fbi);
367 bfin_lq035q1_init_timers(); 399 bfin_lq035q1_init_timers(fbi);
368 400
369 /* start dma */ 401 /* start dma */
370 enable_dma(CH_PPI); 402 enable_dma(CH_PPI);
@@ -402,12 +434,9 @@ static int bfin_lq035q1_fb_release(struct fb_info *info, int user)
402static int bfin_lq035q1_fb_check_var(struct fb_var_screeninfo *var, 434static int bfin_lq035q1_fb_check_var(struct fb_var_screeninfo *var,
403 struct fb_info *info) 435 struct fb_info *info)
404{ 436{
405 switch (var->bits_per_pixel) { 437 struct bfin_lq035q1fb_info *fbi = info->par;
406#if (LCD_BPP == 24) 438
407 case 24:/* TRUECOLOUR, 16m */ 439 if (var->bits_per_pixel == fbi->lcd_bpp) {
408#else
409 case 16:/* DIRECTCOLOUR, 64k */
410#endif
411 var->red.offset = info->var.red.offset; 440 var->red.offset = info->var.red.offset;
412 var->green.offset = info->var.green.offset; 441 var->green.offset = info->var.green.offset;
413 var->blue.offset = info->var.blue.offset; 442 var->blue.offset = info->var.blue.offset;
@@ -420,8 +449,7 @@ static int bfin_lq035q1_fb_check_var(struct fb_var_screeninfo *var,
420 var->red.msb_right = 0; 449 var->red.msb_right = 0;
421 var->green.msb_right = 0; 450 var->green.msb_right = 0;
422 var->blue.msb_right = 0; 451 var->blue.msb_right = 0;
423 break; 452 } else {
424 default:
425 pr_debug("%s: depth not supported: %u BPP\n", __func__, 453 pr_debug("%s: depth not supported: %u BPP\n", __func__,
426 var->bits_per_pixel); 454 var->bits_per_pixel);
427 return -EINVAL; 455 return -EINVAL;
@@ -528,6 +556,7 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
528{ 556{
529 struct bfin_lq035q1fb_info *info; 557 struct bfin_lq035q1fb_info *info;
530 struct fb_info *fbinfo; 558 struct fb_info *fbinfo;
559 u32 active_video_mem_offset;
531 int ret; 560 int ret;
532 561
533 ret = request_dma(CH_PPI, DRIVER_NAME"_CH_PPI"); 562 ret = request_dma(CH_PPI, DRIVER_NAME"_CH_PPI");
@@ -550,6 +579,12 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
550 579
551 platform_set_drvdata(pdev, fbinfo); 580 platform_set_drvdata(pdev, fbinfo);
552 581
582 ret = bfin_lq035q1_calc_timing(info);
583 if (ret < 0) {
584 dev_err(&pdev->dev, "Failed PPI Mode\n");
585 goto out3;
586 }
587
553 strcpy(fbinfo->fix.id, DRIVER_NAME); 588 strcpy(fbinfo->fix.id, DRIVER_NAME);
554 589
555 fbinfo->fix.type = FB_TYPE_PACKED_PIXELS; 590 fbinfo->fix.type = FB_TYPE_PACKED_PIXELS;
@@ -571,46 +606,48 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
571 fbinfo->var.xres_virtual = LCD_X_RES; 606 fbinfo->var.xres_virtual = LCD_X_RES;
572 fbinfo->var.yres = LCD_Y_RES; 607 fbinfo->var.yres = LCD_Y_RES;
573 fbinfo->var.yres_virtual = LCD_Y_RES; 608 fbinfo->var.yres_virtual = LCD_Y_RES;
574 fbinfo->var.bits_per_pixel = LCD_BPP; 609 fbinfo->var.bits_per_pixel = info->lcd_bpp;
575 610
576 if (info->disp_info->mode & LQ035_BGR) { 611 if (info->disp_info->mode & LQ035_BGR) {
577#if (LCD_BPP == 24) 612 if (info->lcd_bpp == 24) {
578 fbinfo->var.red.offset = 0; 613 fbinfo->var.red.offset = 0;
579 fbinfo->var.green.offset = 8; 614 fbinfo->var.green.offset = 8;
580 fbinfo->var.blue.offset = 16; 615 fbinfo->var.blue.offset = 16;
581#else 616 } else {
582 fbinfo->var.red.offset = 0; 617 fbinfo->var.red.offset = 0;
583 fbinfo->var.green.offset = 5; 618 fbinfo->var.green.offset = 5;
584 fbinfo->var.blue.offset = 11; 619 fbinfo->var.blue.offset = 11;
585#endif 620 }
586 } else { 621 } else {
587#if (LCD_BPP == 24) 622 if (info->lcd_bpp == 24) {
588 fbinfo->var.red.offset = 16; 623 fbinfo->var.red.offset = 16;
589 fbinfo->var.green.offset = 8; 624 fbinfo->var.green.offset = 8;
590 fbinfo->var.blue.offset = 0; 625 fbinfo->var.blue.offset = 0;
591#else 626 } else {
592 fbinfo->var.red.offset = 11; 627 fbinfo->var.red.offset = 11;
593 fbinfo->var.green.offset = 5; 628 fbinfo->var.green.offset = 5;
594 fbinfo->var.blue.offset = 0; 629 fbinfo->var.blue.offset = 0;
595#endif 630 }
596 } 631 }
597 632
598 fbinfo->var.transp.offset = 0; 633 fbinfo->var.transp.offset = 0;
599 634
600#if (LCD_BPP == 24) 635 if (info->lcd_bpp == 24) {
601 fbinfo->var.red.length = 8; 636 fbinfo->var.red.length = 8;
602 fbinfo->var.green.length = 8; 637 fbinfo->var.green.length = 8;
603 fbinfo->var.blue.length = 8; 638 fbinfo->var.blue.length = 8;
604#else 639 } else {
605 fbinfo->var.red.length = 5; 640 fbinfo->var.red.length = 5;
606 fbinfo->var.green.length = 6; 641 fbinfo->var.green.length = 6;
607 fbinfo->var.blue.length = 5; 642 fbinfo->var.blue.length = 5;
608#endif 643 }
609 644
610 fbinfo->var.transp.length = 0; 645 fbinfo->var.transp.length = 0;
611 646
612 fbinfo->fix.smem_len = LCD_X_RES * LCD_Y_RES * LCD_BPP / 8 647 active_video_mem_offset = ((U_LINE / 2) * LCD_X_RES * (info->lcd_bpp / 8));
613 + ACTIVE_VIDEO_MEM_OFFSET; 648
649 fbinfo->fix.smem_len = LCD_X_RES * LCD_Y_RES * info->lcd_bpp / 8
650 + active_video_mem_offset;
614 651
615 fbinfo->fix.line_length = fbinfo->var.xres_virtual * 652 fbinfo->fix.line_length = fbinfo->var.xres_virtual *
616 fbinfo->var.bits_per_pixel / 8; 653 fbinfo->var.bits_per_pixel / 8;
@@ -629,8 +666,8 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
629 goto out3; 666 goto out3;
630 } 667 }
631 668
632 fbinfo->screen_base = (void *)info->fb_buffer + ACTIVE_VIDEO_MEM_OFFSET; 669 fbinfo->screen_base = (void *)info->fb_buffer + active_video_mem_offset;
633 fbinfo->fix.smem_start = (int)info->fb_buffer + ACTIVE_VIDEO_MEM_OFFSET; 670 fbinfo->fix.smem_start = (int)info->fb_buffer + active_video_mem_offset;
634 671
635 fbinfo->fbops = &bfin_lq035q1_fb_ops; 672 fbinfo->fbops = &bfin_lq035q1_fb_ops;
636 673
@@ -643,7 +680,8 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
643 goto out4; 680 goto out4;
644 } 681 }
645 682
646 ret = bfin_lq035q1_request_ports(pdev); 683 ret = bfin_lq035q1_request_ports(pdev,
684 info->disp_info->ppi_mode == USE_RGB565_16_BIT_PPI);
647 if (ret) { 685 if (ret) {
648 dev_err(&pdev->dev, "couldn't request gpio port\n"); 686 dev_err(&pdev->dev, "couldn't request gpio port\n");
649 goto out6; 687 goto out6;
@@ -693,7 +731,7 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
693 } 731 }
694 732
695 dev_info(&pdev->dev, "%dx%d %d-bit RGB FrameBuffer initialized\n", 733 dev_info(&pdev->dev, "%dx%d %d-bit RGB FrameBuffer initialized\n",
696 LCD_X_RES, LCD_Y_RES, LCD_BPP); 734 LCD_X_RES, LCD_Y_RES, info->lcd_bpp);
697 735
698 return 0; 736 return 0;
699 737
@@ -705,7 +743,8 @@ static int __devinit bfin_lq035q1_probe(struct platform_device *pdev)
705 out8: 743 out8:
706 free_irq(info->irq, info); 744 free_irq(info->irq, info);
707 out7: 745 out7:
708 bfin_lq035q1_free_ports(); 746 bfin_lq035q1_free_ports(info->disp_info->ppi_mode ==
747 USE_RGB565_16_BIT_PPI);
709 out6: 748 out6:
710 fb_dealloc_cmap(&fbinfo->cmap); 749 fb_dealloc_cmap(&fbinfo->cmap);
711 out4: 750 out4:
@@ -742,7 +781,8 @@ static int __devexit bfin_lq035q1_remove(struct platform_device *pdev)
742 781
743 fb_dealloc_cmap(&fbinfo->cmap); 782 fb_dealloc_cmap(&fbinfo->cmap);
744 783
745 bfin_lq035q1_free_ports(); 784 bfin_lq035q1_free_ports(info->disp_info->ppi_mode ==
785 USE_RGB565_16_BIT_PPI);
746 786
747 platform_set_drvdata(pdev, NULL); 787 platform_set_drvdata(pdev, NULL);
748 framebuffer_release(fbinfo); 788 framebuffer_release(fbinfo);
@@ -781,7 +821,7 @@ static int bfin_lq035q1_resume(struct device *dev)
781 821
782 bfin_lq035q1_config_dma(info); 822 bfin_lq035q1_config_dma(info);
783 bfin_lq035q1_config_ppi(info); 823 bfin_lq035q1_config_ppi(info);
784 bfin_lq035q1_init_timers(); 824 bfin_lq035q1_init_timers(info);
785 825
786 /* start dma */ 826 /* start dma */
787 enable_dma(CH_PPI); 827 enable_dma(CH_PPI);
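
The bfin-lq035q1 refactor above turns compile-time #ifdef selection of the PPI wiring (USE_RGB565_16_BIT_PPI and friends) into a runtime switch on disp_info->ppi_mode, so one binary can drive all three interface variants. The mode presumably now arrives through board platform data; the sketch below is an assumption, since the bfin_lq035q1fb_disp_info layout and its header are not shown in this diff:

    #include <linux/platform_device.h>
    #include <asm/bfin-lq035q1.h>    /* assumed location of the defines */

    static struct bfin_lq035q1fb_disp_info bfin_lq035q1_data = {
        .mode     = LQ035_BGR,    /* panel flags, as tested by the driver */
        .ppi_mode = USE_RGB565_16_BIT_PPI,    /* or one of the 8-bit modes */
    };

    static struct platform_device bfin_lq035q1_device = {
        .name = "bfin-lq035q1",    /* assumed to match DRIVER_NAME */
        .id   = -1,
        .dev  = {
            .platform_data = &bfin_lq035q1_data,
        },
    };
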
diff --git a/drivers/video/da8xx-fb.c b/drivers/video/da8xx-fb.c
index 8d244ba0f601..cad7d45c8bac 100644
--- a/drivers/video/da8xx-fb.c
+++ b/drivers/video/da8xx-fb.c
@@ -36,7 +36,9 @@
36#define DRIVER_NAME "da8xx_lcdc" 36#define DRIVER_NAME "da8xx_lcdc"
37 37
38/* LCD Status Register */ 38/* LCD Status Register */
39#define LCD_END_OF_FRAME1 BIT(9)
39#define LCD_END_OF_FRAME0 BIT(8) 40#define LCD_END_OF_FRAME0 BIT(8)
41#define LCD_PL_LOAD_DONE BIT(6)
40#define LCD_FIFO_UNDERFLOW BIT(5) 42#define LCD_FIFO_UNDERFLOW BIT(5)
41#define LCD_SYNC_LOST BIT(2) 43#define LCD_SYNC_LOST BIT(2)
42 44
@@ -58,11 +60,13 @@
58#define LCD_PALETTE_LOAD_MODE(x) ((x) << 20) 60#define LCD_PALETTE_LOAD_MODE(x) ((x) << 20)
59#define PALETTE_AND_DATA 0x00 61#define PALETTE_AND_DATA 0x00
60#define PALETTE_ONLY 0x01 62#define PALETTE_ONLY 0x01
63#define DATA_ONLY 0x02
61 64
62#define LCD_MONO_8BIT_MODE BIT(9) 65#define LCD_MONO_8BIT_MODE BIT(9)
63#define LCD_RASTER_ORDER BIT(8) 66#define LCD_RASTER_ORDER BIT(8)
64#define LCD_TFT_MODE BIT(7) 67#define LCD_TFT_MODE BIT(7)
65#define LCD_UNDERFLOW_INT_ENA BIT(6) 68#define LCD_UNDERFLOW_INT_ENA BIT(6)
69#define LCD_PL_ENABLE BIT(4)
66#define LCD_MONOCHROME_MODE BIT(1) 70#define LCD_MONOCHROME_MODE BIT(1)
67#define LCD_RASTER_ENABLE BIT(0) 71#define LCD_RASTER_ENABLE BIT(0)
68#define LCD_TFT_ALT_ENABLE BIT(23) 72#define LCD_TFT_ALT_ENABLE BIT(23)
@@ -87,6 +91,10 @@
87#define LCD_DMA_CTRL_REG 0x40 91#define LCD_DMA_CTRL_REG 0x40
88#define LCD_DMA_FRM_BUF_BASE_ADDR_0_REG 0x44 92#define LCD_DMA_FRM_BUF_BASE_ADDR_0_REG 0x44
89#define LCD_DMA_FRM_BUF_CEILING_ADDR_0_REG 0x48 93#define LCD_DMA_FRM_BUF_CEILING_ADDR_0_REG 0x48
94#define LCD_DMA_FRM_BUF_BASE_ADDR_1_REG 0x4C
95#define LCD_DMA_FRM_BUF_CEILING_ADDR_1_REG 0x50
96
97#define LCD_NUM_BUFFERS 2
90 98
91#define WSI_TIMEOUT 50 99#define WSI_TIMEOUT 50
92#define PALETTE_SIZE 256 100#define PALETTE_SIZE 256
@@ -111,13 +119,20 @@ static inline void lcdc_write(unsigned int val, unsigned int addr)
111struct da8xx_fb_par { 119struct da8xx_fb_par {
112 resource_size_t p_palette_base; 120 resource_size_t p_palette_base;
113 unsigned char *v_palette_base; 121 unsigned char *v_palette_base;
122 dma_addr_t vram_phys;
123 unsigned long vram_size;
124 void *vram_virt;
125 unsigned int dma_start;
126 unsigned int dma_end;
114 struct clk *lcdc_clk; 127 struct clk *lcdc_clk;
115 int irq; 128 int irq;
116 unsigned short pseudo_palette[16]; 129 unsigned short pseudo_palette[16];
117 unsigned int databuf_sz;
118 unsigned int palette_sz; 130 unsigned int palette_sz;
119 unsigned int pxl_clk; 131 unsigned int pxl_clk;
120 int blank; 132 int blank;
133 wait_queue_head_t vsync_wait;
134 int vsync_flag;
135 int vsync_timeout;
121#ifdef CONFIG_CPU_FREQ 136#ifdef CONFIG_CPU_FREQ
122 struct notifier_block freq_transition; 137 struct notifier_block freq_transition;
123#endif 138#endif
@@ -148,9 +163,9 @@ static struct fb_fix_screeninfo da8xx_fb_fix __devinitdata = {
148 .type = FB_TYPE_PACKED_PIXELS, 163 .type = FB_TYPE_PACKED_PIXELS,
149 .type_aux = 0, 164 .type_aux = 0,
150 .visual = FB_VISUAL_PSEUDOCOLOR, 165 .visual = FB_VISUAL_PSEUDOCOLOR,
151 .xpanstep = 1, 166 .xpanstep = 0,
152 .ypanstep = 1, 167 .ypanstep = 1,
153 .ywrapstep = 1, 168 .ywrapstep = 0,
154 .accel = FB_ACCEL_NONE 169 .accel = FB_ACCEL_NONE
155}; 170};
156 171
@@ -221,22 +236,48 @@ static inline void lcd_disable_raster(void)
221 236
222static void lcd_blit(int load_mode, struct da8xx_fb_par *par) 237static void lcd_blit(int load_mode, struct da8xx_fb_par *par)
223{ 238{
224 u32 tmp = par->p_palette_base + par->databuf_sz - 4; 239 u32 start;
225 u32 reg; 240 u32 end;
241 u32 reg_ras;
242 u32 reg_dma;
243
244 /* init reg to clear PLM (loading mode) fields */
245 reg_ras = lcdc_read(LCD_RASTER_CTRL_REG);
246 reg_ras &= ~(3 << 20);
247
248 reg_dma = lcdc_read(LCD_DMA_CTRL_REG);
249
250 if (load_mode == LOAD_DATA) {
251 start = par->dma_start;
252 end = par->dma_end;
253
254 reg_ras |= LCD_PALETTE_LOAD_MODE(DATA_ONLY);
255 reg_dma |= LCD_END_OF_FRAME_INT_ENA;
256 reg_dma |= LCD_DUAL_FRAME_BUFFER_ENABLE;
257
258 lcdc_write(start, LCD_DMA_FRM_BUF_BASE_ADDR_0_REG);
259 lcdc_write(end, LCD_DMA_FRM_BUF_CEILING_ADDR_0_REG);
260 lcdc_write(start, LCD_DMA_FRM_BUF_BASE_ADDR_1_REG);
261 lcdc_write(end, LCD_DMA_FRM_BUF_CEILING_ADDR_1_REG);
262 } else if (load_mode == LOAD_PALETTE) {
263 start = par->p_palette_base;
264 end = start + par->palette_sz - 1;
265
266 reg_ras |= LCD_PALETTE_LOAD_MODE(PALETTE_ONLY);
267 reg_ras |= LCD_PL_ENABLE;
268
269 lcdc_write(start, LCD_DMA_FRM_BUF_BASE_ADDR_0_REG);
270 lcdc_write(end, LCD_DMA_FRM_BUF_CEILING_ADDR_0_REG);
271 }
226 272
227 /* Update the databuf in the hw. */ 273 lcdc_write(reg_dma, LCD_DMA_CTRL_REG);
228 lcdc_write(par->p_palette_base, LCD_DMA_FRM_BUF_BASE_ADDR_0_REG); 274 lcdc_write(reg_ras, LCD_RASTER_CTRL_REG);
229 lcdc_write(tmp, LCD_DMA_FRM_BUF_CEILING_ADDR_0_REG);
230 275
231 /* Start the DMA. */ 276 /*
232 reg = lcdc_read(LCD_RASTER_CTRL_REG); 277 * The Raster enable bit must be set after all other control fields are
233 reg &= ~(3 << 20); 278 * set.
234 if (load_mode == LOAD_DATA) 279 */
235 reg |= LCD_PALETTE_LOAD_MODE(PALETTE_AND_DATA); 280 lcd_enable_raster();
236 else if (load_mode == LOAD_PALETTE)
237 reg |= LCD_PALETTE_LOAD_MODE(PALETTE_ONLY);
238
239 lcdc_write(reg, LCD_RASTER_CTRL_REG);
240} 281}
241 282
242/* Configure the Burst Size of DMA */ 283/* Configure the Burst Size of DMA */
@@ -368,12 +409,8 @@ static int lcd_cfg_display(const struct lcd_ctrl_config *cfg)
368static int lcd_cfg_frame_buffer(struct da8xx_fb_par *par, u32 width, u32 height, 409static int lcd_cfg_frame_buffer(struct da8xx_fb_par *par, u32 width, u32 height,
369 u32 bpp, u32 raster_order) 410 u32 bpp, u32 raster_order)
370{ 411{
371 u32 bpl, reg; 412 u32 reg;
372 413
373 /* Disable Dual Frame Buffer. */
374 reg = lcdc_read(LCD_DMA_CTRL_REG);
375 lcdc_write(reg & ~LCD_DUAL_FRAME_BUFFER_ENABLE,
376 LCD_DMA_CTRL_REG);
377 /* Set the Panel Width */ 414 /* Set the Panel Width */
378 /* Pixels per line = (PPL + 1)*16 */ 415 /* Pixels per line = (PPL + 1)*16 */
379 /*0x3F in bits 4..9 gives max horizontal resolution = 1024 pixels*/ 416 /*0x3F in bits 4..9 gives max horizontal resolution = 1024 pixels*/
@@ -410,9 +447,6 @@ static int lcd_cfg_frame_buffer(struct da8xx_fb_par *par, u32 width, u32 height,
410 return -EINVAL; 447 return -EINVAL;
411 } 448 }
412 449
413 bpl = width * bpp / 8;
414 par->databuf_sz = height * bpl + par->palette_sz;
415
416 return 0; 450 return 0;
417} 451}
418 452
@@ -421,8 +455,9 @@ static int fb_setcolreg(unsigned regno, unsigned red, unsigned green,
421 struct fb_info *info) 455 struct fb_info *info)
422{ 456{
423 struct da8xx_fb_par *par = info->par; 457 struct da8xx_fb_par *par = info->par;
424 unsigned short *palette = (unsigned short *)par->v_palette_base; 458 unsigned short *palette = (unsigned short *) par->v_palette_base;
425 u_short pal; 459 u_short pal;
460 int update_hw = 0;
426 461
427 if (regno > 255) 462 if (regno > 255)
428 return 1; 463 return 1;
@@ -439,8 +474,10 @@ static int fb_setcolreg(unsigned regno, unsigned red, unsigned green,
439 pal |= (green & 0x00f0); 474 pal |= (green & 0x00f0);
440 pal |= (blue & 0x000f); 475 pal |= (blue & 0x000f);
441 476
442 palette[regno] = pal; 477 if (palette[regno] != pal) {
443 478 update_hw = 1;
479 palette[regno] = pal;
480 }
444 } else if ((info->var.bits_per_pixel == 16) && regno < 16) { 481 } else if ((info->var.bits_per_pixel == 16) && regno < 16) {
445 red >>= (16 - info->var.red.length); 482 red >>= (16 - info->var.red.length);
446 red <<= info->var.red.offset; 483 red <<= info->var.red.offset;
@@ -453,9 +490,16 @@ static int fb_setcolreg(unsigned regno, unsigned red, unsigned green,
453 490
454 par->pseudo_palette[regno] = red | green | blue; 491 par->pseudo_palette[regno] = red | green | blue;
455 492
456 palette[0] = 0x4000; 493 if (palette[0] != 0x4000) {
494 update_hw = 1;
495 palette[0] = 0x4000;
496 }
457 } 497 }
458 498
499 /* Update the palette in the h/w as needed. */
500 if (update_hw)
501 lcd_blit(LOAD_PALETTE, par);
502
459 return 0; 503 return 0;
460} 504}
461 505
@@ -541,15 +585,54 @@ static int lcd_init(struct da8xx_fb_par *par, const struct lcd_ctrl_config *cfg,
541 585
542static irqreturn_t lcdc_irq_handler(int irq, void *arg) 586static irqreturn_t lcdc_irq_handler(int irq, void *arg)
543{ 587{
588 struct da8xx_fb_par *par = arg;
544 u32 stat = lcdc_read(LCD_STAT_REG); 589 u32 stat = lcdc_read(LCD_STAT_REG);
590 u32 reg_ras;
545 591
546 if ((stat & LCD_SYNC_LOST) && (stat & LCD_FIFO_UNDERFLOW)) { 592 if ((stat & LCD_SYNC_LOST) && (stat & LCD_FIFO_UNDERFLOW)) {
547 lcd_disable_raster(); 593 lcd_disable_raster();
548 lcdc_write(stat, LCD_STAT_REG); 594 lcdc_write(stat, LCD_STAT_REG);
549 lcd_enable_raster(); 595 lcd_enable_raster();
550 } else 596 } else if (stat & LCD_PL_LOAD_DONE) {
597 /*
598 * Must disable raster before changing state of any control bit.
599 * And also must be disabled before clearing the PL loading
600 * interrupt via the following write to the status register. If
601 * this is done afterwards, one gets multiple PL done interrupts.
602 */
603 lcd_disable_raster();
604
551 lcdc_write(stat, LCD_STAT_REG); 605 lcdc_write(stat, LCD_STAT_REG);
552 606
607 /* Disable PL completion interrupt */
608 reg_ras = lcdc_read(LCD_RASTER_CTRL_REG);
609 reg_ras &= ~LCD_PL_ENABLE;
610 lcdc_write(reg_ras, LCD_RASTER_CTRL_REG);
611
612 /* Setup and start data loading mode */
613 lcd_blit(LOAD_DATA, par);
614 } else {
615 lcdc_write(stat, LCD_STAT_REG);
616
617 if (stat & LCD_END_OF_FRAME0) {
618 lcdc_write(par->dma_start,
619 LCD_DMA_FRM_BUF_BASE_ADDR_0_REG);
620 lcdc_write(par->dma_end,
621 LCD_DMA_FRM_BUF_CEILING_ADDR_0_REG);
622 par->vsync_flag = 1;
623 wake_up_interruptible(&par->vsync_wait);
624 }
625
626 if (stat & LCD_END_OF_FRAME1) {
627 lcdc_write(par->dma_start,
628 LCD_DMA_FRM_BUF_BASE_ADDR_1_REG);
629 lcdc_write(par->dma_end,
630 LCD_DMA_FRM_BUF_CEILING_ADDR_1_REG);
631 par->vsync_flag = 1;
632 wake_up_interruptible(&par->vsync_wait);
633 }
634 }
635
553 return IRQ_HANDLED; 636 return IRQ_HANDLED;
554} 637}
555 638
@@ -654,9 +737,10 @@ static int __devexit fb_remove(struct platform_device *dev)
654 737
655 unregister_framebuffer(info); 738 unregister_framebuffer(info);
656 fb_dealloc_cmap(&info->cmap); 739 fb_dealloc_cmap(&info->cmap);
657 dma_free_coherent(NULL, par->databuf_sz + PAGE_SIZE, 740 dma_free_coherent(NULL, PALETTE_SIZE, par->v_palette_base,
658 info->screen_base - PAGE_SIZE, 741 par->p_palette_base);
659 info->fix.smem_start); 742 dma_free_coherent(NULL, par->vram_size, par->vram_virt,
743 par->vram_phys);
660 free_irq(par->irq, par); 744 free_irq(par->irq, par);
661 clk_disable(par->lcdc_clk); 745 clk_disable(par->lcdc_clk);
662 clk_put(par->lcdc_clk); 746 clk_put(par->lcdc_clk);
@@ -668,6 +752,39 @@ static int __devexit fb_remove(struct platform_device *dev)
668 return 0; 752 return 0;
669} 753}
670 754
755/*
756 * Function to wait for vertical sync which for this LCD peripheral
757 * translates into waiting for the current raster frame to complete.
758 */
759static int fb_wait_for_vsync(struct fb_info *info)
760{
761 struct da8xx_fb_par *par = info->par;
762 int ret;
763
764 /*
765 * Set flag to 0 and wait for isr to set to 1. It would seem there is a
766 * race condition here where the ISR could have occurred just before or
767 * just after this set. But since we are just coarsely waiting for
768 * a frame to complete, that's OK; i.e. if the frame completed
769 * just before this code executed, we have to wait another full
770 * frame time but there is no way to avoid such a situation. On the
771 * other hand if the frame completed just after then we don't need
772 * to wait long at all. Either way we are guaranteed to return to the
773 * user immediately after a frame completion which is all that is
774 * required.
775 */
776 par->vsync_flag = 0;
777 ret = wait_event_interruptible_timeout(par->vsync_wait,
778 par->vsync_flag != 0,
779 par->vsync_timeout);
780 if (ret < 0)
781 return ret;
782 if (ret == 0)
783 return -ETIMEDOUT;
784
785 return 0;
786}
787
671static int fb_ioctl(struct fb_info *info, unsigned int cmd, 788static int fb_ioctl(struct fb_info *info, unsigned int cmd,
672 unsigned long arg) 789 unsigned long arg)
673{ 790{
@@ -697,6 +814,8 @@ static int fb_ioctl(struct fb_info *info, unsigned int cmd,
697 sync_arg.pulse_width, 814 sync_arg.pulse_width,
698 sync_arg.front_porch); 815 sync_arg.front_porch);
699 break; 816 break;
817 case FBIO_WAITFORVSYNC:
818 return fb_wait_for_vsync(info);
700 default: 819 default:
701 return -EINVAL; 820 return -EINVAL;
702 } 821 }
@@ -732,10 +851,47 @@ static int cfb_blank(int blank, struct fb_info *info)
732 return ret; 851 return ret;
733} 852}
734 853
854/*
855 * Set new x,y offsets in the virtual display for the visible area and switch
856 * to the new mode.
857 */
858static int da8xx_pan_display(struct fb_var_screeninfo *var,
859 struct fb_info *fbi)
860{
861 int ret = 0;
862 struct fb_var_screeninfo new_var;
863 struct da8xx_fb_par *par = fbi->par;
864 struct fb_fix_screeninfo *fix = &fbi->fix;
865 unsigned int end;
866 unsigned int start;
867
868 if (var->xoffset != fbi->var.xoffset ||
869 var->yoffset != fbi->var.yoffset) {
870 memcpy(&new_var, &fbi->var, sizeof(new_var));
871 new_var.xoffset = var->xoffset;
872 new_var.yoffset = var->yoffset;
873 if (fb_check_var(&new_var, fbi))
874 ret = -EINVAL;
875 else {
876 memcpy(&fbi->var, &new_var, sizeof(new_var));
877
878 start = fix->smem_start +
879 new_var.yoffset * fix->line_length +
880 new_var.xoffset * var->bits_per_pixel / 8;
881 end = start + var->yres * fix->line_length - 1;
882 par->dma_start = start;
883 par->dma_end = end;
884 }
885 }
886
887 return ret;
888}
889
735static struct fb_ops da8xx_fb_ops = { 890static struct fb_ops da8xx_fb_ops = {
736 .owner = THIS_MODULE, 891 .owner = THIS_MODULE,
737 .fb_check_var = fb_check_var, 892 .fb_check_var = fb_check_var,
738 .fb_setcolreg = fb_setcolreg, 893 .fb_setcolreg = fb_setcolreg,
894 .fb_pan_display = da8xx_pan_display,
739 .fb_ioctl = fb_ioctl, 895 .fb_ioctl = fb_ioctl,
740 .fb_fillrect = cfb_fillrect, 896 .fb_fillrect = cfb_fillrect,
741 .fb_copyarea = cfb_copyarea, 897 .fb_copyarea = cfb_copyarea,
@@ -829,40 +985,53 @@ static int __init fb_probe(struct platform_device *device)
829 } 985 }
830 986
831 /* allocate frame buffer */ 987 /* allocate frame buffer */
832 da8xx_fb_info->screen_base = dma_alloc_coherent(NULL, 988 par->vram_size = lcdc_info->width * lcdc_info->height * lcd_cfg->bpp;
833 par->databuf_sz + PAGE_SIZE, 989 par->vram_size = PAGE_ALIGN(par->vram_size/8);
834 (resource_size_t *) 990 par->vram_size = par->vram_size * LCD_NUM_BUFFERS;
835 &da8xx_fb_info->fix.smem_start, 991
836 GFP_KERNEL | GFP_DMA); 992 par->vram_virt = dma_alloc_coherent(NULL,
837 993 par->vram_size,
838 if (!da8xx_fb_info->screen_base) { 994 (resource_size_t *) &par->vram_phys,
995 GFP_KERNEL | GFP_DMA);
996 if (!par->vram_virt) {
839 dev_err(&device->dev, 997 dev_err(&device->dev,
840 "GLCD: kmalloc for frame buffer failed\n"); 998 "GLCD: kmalloc for frame buffer failed\n");
841 ret = -EINVAL; 999 ret = -EINVAL;
842 goto err_release_fb; 1000 goto err_release_fb;
843 } 1001 }
844 1002
845 /* move palette base pointer by (PAGE_SIZE - palette_sz) bytes */ 1003 da8xx_fb_info->screen_base = (char __iomem *) par->vram_virt;
846 par->v_palette_base = da8xx_fb_info->screen_base + 1004 da8xx_fb_fix.smem_start = par->vram_phys;
847 (PAGE_SIZE - par->palette_sz); 1005 da8xx_fb_fix.smem_len = par->vram_size;
848 par->p_palette_base = da8xx_fb_info->fix.smem_start + 1006 da8xx_fb_fix.line_length = (lcdc_info->width * lcd_cfg->bpp) / 8;
849 (PAGE_SIZE - par->palette_sz); 1007
850 1008 par->dma_start = par->vram_phys;
851 /* the rest of the frame buffer is pixel data */ 1009 par->dma_end = par->dma_start + lcdc_info->height *
852 da8xx_fb_info->screen_base = par->v_palette_base + par->palette_sz; 1010 da8xx_fb_fix.line_length - 1;
853 da8xx_fb_fix.smem_start = par->p_palette_base + par->palette_sz; 1011
854 da8xx_fb_fix.smem_len = par->databuf_sz - par->palette_sz; 1012 /* allocate palette buffer */
855 da8xx_fb_fix.line_length = (lcdc_info->width * lcd_cfg->bpp) / 8; 1013 par->v_palette_base = dma_alloc_coherent(NULL,
1014 PALETTE_SIZE,
1015 (resource_size_t *)
1016 &par->p_palette_base,
1017 GFP_KERNEL | GFP_DMA);
1018 if (!par->v_palette_base) {
1019 dev_err(&device->dev,
1020 "GLCD: kmalloc for palette buffer failed\n");
1021 ret = -EINVAL;
1022 goto err_release_fb_mem;
1023 }
1024 memset(par->v_palette_base, 0, PALETTE_SIZE);
856 1025
857 par->irq = platform_get_irq(device, 0); 1026 par->irq = platform_get_irq(device, 0);
858 if (par->irq < 0) { 1027 if (par->irq < 0) {
859 ret = -ENOENT; 1028 ret = -ENOENT;
860 goto err_release_fb_mem; 1029 goto err_release_pl_mem;
861 } 1030 }
862 1031
863 ret = request_irq(par->irq, lcdc_irq_handler, 0, DRIVER_NAME, par); 1032 ret = request_irq(par->irq, lcdc_irq_handler, 0, DRIVER_NAME, par);
864 if (ret) 1033 if (ret)
865 goto err_release_fb_mem; 1034 goto err_release_pl_mem;
866 1035
867 /* Initialize par */ 1036 /* Initialize par */
868 da8xx_fb_info->var.bits_per_pixel = lcd_cfg->bpp; 1037 da8xx_fb_info->var.bits_per_pixel = lcd_cfg->bpp;
@@ -870,8 +1039,8 @@ static int __init fb_probe(struct platform_device *device)
870 da8xx_fb_var.xres = lcdc_info->width; 1039 da8xx_fb_var.xres = lcdc_info->width;
871 da8xx_fb_var.xres_virtual = lcdc_info->width; 1040 da8xx_fb_var.xres_virtual = lcdc_info->width;
872 1041
873 da8xx_fb_var.yres = lcdc_info->height; 1042 da8xx_fb_var.yres = lcdc_info->height;
874 da8xx_fb_var.yres_virtual = lcdc_info->height; 1043 da8xx_fb_var.yres_virtual = lcdc_info->height * LCD_NUM_BUFFERS;
875 1044
876 da8xx_fb_var.grayscale = 1045 da8xx_fb_var.grayscale =
877 lcd_cfg->p_disp_panel->panel_shade == MONOCHROME ? 1 : 0; 1046 lcd_cfg->p_disp_panel->panel_shade == MONOCHROME ? 1 : 0;
@@ -892,18 +1061,18 @@ static int __init fb_probe(struct platform_device *device)
892 ret = fb_alloc_cmap(&da8xx_fb_info->cmap, PALETTE_SIZE, 0); 1061 ret = fb_alloc_cmap(&da8xx_fb_info->cmap, PALETTE_SIZE, 0);
893 if (ret) 1062 if (ret)
894 goto err_free_irq; 1063 goto err_free_irq;
895
896 /* First palette_sz byte of the frame buffer is the palette */
897 da8xx_fb_info->cmap.len = par->palette_sz; 1064 da8xx_fb_info->cmap.len = par->palette_sz;
898 1065
899 /* Flush the buffer to the screen. */
900 lcd_blit(LOAD_DATA, par);
901
902 /* initialize var_screeninfo */ 1066 /* initialize var_screeninfo */
903 da8xx_fb_var.activate = FB_ACTIVATE_FORCE; 1067 da8xx_fb_var.activate = FB_ACTIVATE_FORCE;
904 fb_set_var(da8xx_fb_info, &da8xx_fb_var); 1068 fb_set_var(da8xx_fb_info, &da8xx_fb_var);
905 1069
906 dev_set_drvdata(&device->dev, da8xx_fb_info); 1070 dev_set_drvdata(&device->dev, da8xx_fb_info);
1071
1072 /* initialize the vsync wait queue */
1073 init_waitqueue_head(&par->vsync_wait);
1074 par->vsync_timeout = HZ / 5;
1075
907 /* Register the Frame Buffer */ 1076 /* Register the Frame Buffer */
908 if (register_framebuffer(da8xx_fb_info) < 0) { 1077 if (register_framebuffer(da8xx_fb_info) < 0) {
909 dev_err(&device->dev, 1078 dev_err(&device->dev,
@@ -919,10 +1088,6 @@ static int __init fb_probe(struct platform_device *device)
919 goto err_cpu_freq; 1088 goto err_cpu_freq;
920 } 1089 }
921#endif 1090#endif
922
923 /* enable raster engine */
924 lcd_enable_raster();
925
926 return 0; 1091 return 0;
927 1092
928#ifdef CONFIG_CPU_FREQ 1093#ifdef CONFIG_CPU_FREQ
@@ -936,10 +1101,12 @@ err_dealloc_cmap:
936err_free_irq: 1101err_free_irq:
937 free_irq(par->irq, par); 1102 free_irq(par->irq, par);
938 1103
1104err_release_pl_mem:
1105 dma_free_coherent(NULL, PALETTE_SIZE, par->v_palette_base,
1106 par->p_palette_base);
1107
939err_release_fb_mem: 1108err_release_fb_mem:
940 dma_free_coherent(NULL, par->databuf_sz + PAGE_SIZE, 1109 dma_free_coherent(NULL, par->vram_size, par->vram_virt, par->vram_phys);
941 da8xx_fb_info->screen_base - PAGE_SIZE,
942 da8xx_fb_info->fix.smem_start);
943 1110
944err_release_fb: 1111err_release_fb:
945 framebuffer_release(da8xx_fb_info); 1112 framebuffer_release(da8xx_fb_info);
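
Taken together, the da8xx changes implement tear-free double buffering: two DMA frame-buffer address register pairs, yres_virtual grown to height * LCD_NUM_BUFFERS, an fb_pan_display hook that recomputes dma_start/dma_end, and an FBIO_WAITFORVSYNC ioctl driven by the end-of-frame interrupts. A hedged user-space sketch of how the pieces combine, assuming a kernel where FBIO_WAITFORVSYNC is exported via linux/fb.h (device setup and error reporting trimmed):

    #include <sys/ioctl.h>
    #include <linux/fb.h>

    /* Flip to buffer 0 or 1, synchronised to the raster. */
    static int flip(int fd, struct fb_var_screeninfo *var, int buf)
    {
        __u32 dummy = 0;

        var->yoffset = buf * var->yres;        /* select the buffer */
        if (ioctl(fd, FBIOPAN_DISPLAY, var))   /* -> da8xx_pan_display() */
            return -1;
        /* -> fb_wait_for_vsync(): returns after the next frame completes */
        return ioctl(fd, FBIO_WAITFORVSYNC, &dummy);
    }

A render loop draws into the half that is currently off screen, calls flip(), and toggles buf between 0 and 1.
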
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 6113c47e095a..1105a591dcc1 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -155,25 +155,41 @@ static void fb_deferred_io_work(struct work_struct *work)
155{ 155{
156 struct fb_info *info = container_of(work, struct fb_info, 156 struct fb_info *info = container_of(work, struct fb_info,
157 deferred_work.work); 157 deferred_work.work);
158 struct list_head *node, *next;
159 struct page *cur;
160 struct fb_deferred_io *fbdefio = info->fbdefio; 158 struct fb_deferred_io *fbdefio = info->fbdefio;
159 struct page *page, *tmp_page;
160 struct list_head *node, *tmp_node;
161 struct list_head non_dirty;
162
163 INIT_LIST_HEAD(&non_dirty);
161 164
162 /* here we mkclean the pages, then do all deferred IO */ 165 /* here we mkclean the pages, then do all deferred IO */
163 mutex_lock(&fbdefio->lock); 166 mutex_lock(&fbdefio->lock);
164 list_for_each_entry(cur, &fbdefio->pagelist, lru) { 167 list_for_each_entry_safe(page, tmp_page, &fbdefio->pagelist, lru) {
165 lock_page(cur); 168 lock_page(page);
166 page_mkclean(cur); 169 /*
167 unlock_page(cur); 170 * The workqueue callback can be triggered after a
171 * ->page_mkwrite() call but before the PTE has been marked
172 * dirty. In this case page_mkclean() won't "rearm" the page.
173 *
174 * To avoid this, remove those "non-dirty" pages from the
175 * pagelist before calling the driver's callback, then add
176 * them back to get processed on the next work iteration.
177 * At that time, their PTEs will hopefully be dirty for real.
178 */
179 if (!page_mkclean(page))
180 list_move_tail(&page->lru, &non_dirty);
181 unlock_page(page);
168 } 182 }
169 183
170 /* driver's callback with pagelist */ 184 /* driver's callback with pagelist */
171 fbdefio->deferred_io(info, &fbdefio->pagelist); 185 fbdefio->deferred_io(info, &fbdefio->pagelist);
172 186
173 /* clear the list */ 187 /* clear the list... */
174 list_for_each_safe(node, next, &fbdefio->pagelist) { 188 list_for_each_safe(node, tmp_node, &fbdefio->pagelist) {
175 list_del(node); 189 list_del(node);
176 } 190 }
191 /* ... and add back the "non-dirty" pages to the list */
192 list_splice_tail(&non_dirty, &fbdefio->pagelist);
177 mutex_unlock(&fbdefio->lock); 193 mutex_unlock(&fbdefio->lock);
178} 194}
179 195
@@ -202,6 +218,7 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_open);
202void fb_deferred_io_cleanup(struct fb_info *info) 218void fb_deferred_io_cleanup(struct fb_info *info)
203{ 219{
204 struct fb_deferred_io *fbdefio = info->fbdefio; 220 struct fb_deferred_io *fbdefio = info->fbdefio;
221 struct list_head *node, *tmp_node;
205 struct page *page; 222 struct page *page;
206 int i; 223 int i;
207 224
@@ -209,6 +226,13 @@ void fb_deferred_io_cleanup(struct fb_info *info)
209 cancel_delayed_work(&info->deferred_work); 226 cancel_delayed_work(&info->deferred_work);
210 flush_scheduled_work(); 227 flush_scheduled_work();
211 228
229 /* the list may still have some non-dirty pages at this point */
230 mutex_lock(&fbdefio->lock);
231 list_for_each_safe(node, tmp_node, &fbdefio->pagelist) {
232 list_del(node);
233 }
234 mutex_unlock(&fbdefio->lock);
235
212 /* clear out the mapping that we setup */ 236 /* clear out the mapping that we setup */
213 for (i = 0 ; i < info->fix.smem_len; i += PAGE_SIZE) { 237 for (i = 0 ; i < info->fix.smem_len; i += PAGE_SIZE) {
214 page = fb_deferred_io_page(info, i); 238 page = fb_deferred_io_page(info, i);
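
For context on the fb_defio change above: once fb_deferred_io_work() has re-armed the genuinely dirty pages (and parked the racing ones for the next pass), the surviving pagelist is handed to the driver's deferred_io callback, which pushes each PAGE_SIZE chunk out to the hardware. A minimal sketch of such a callback; examplefb_flush_page() is a hypothetical helper:

    #include <linux/fb.h>
    #include <linux/jiffies.h>
    #include <linux/list.h>
    #include <linux/mm.h>

    /* Hypothetical helper: push one page-sized chunk at 'offset' out. */
    static void examplefb_flush_page(struct fb_info *info,
                                     unsigned long offset);

    static void examplefb_dpy_deferred_io(struct fb_info *info,
                                          struct list_head *pagelist)
    {
        struct page *page;

        /* Each entry is one dirty framebuffer page; page->index is its
         * page offset within the framebuffer mapping. */
        list_for_each_entry(page, pagelist, lru)
            examplefb_flush_page(info, page->index << PAGE_SHIFT);
    }

    static struct fb_deferred_io examplefb_defio = {
        .delay       = HZ / 10,    /* coalesce writes for ~100 ms */
        .deferred_io = examplefb_dpy_deferred_io,
    };
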
diff --git a/drivers/video/hgafb.c b/drivers/video/hgafb.c
index 8bbf251f83d9..af8f0f2cc782 100644
--- a/drivers/video/hgafb.c
+++ b/drivers/video/hgafb.c
@@ -106,7 +106,7 @@ static DEFINE_SPINLOCK(hga_reg_lock);
106 106
107/* Framebuffer driver structures */ 107/* Framebuffer driver structures */
108 108
109static struct fb_var_screeninfo __initdata hga_default_var = { 109static struct fb_var_screeninfo hga_default_var __devinitdata = {
110 .xres = 720, 110 .xres = 720,
111 .yres = 348, 111 .yres = 348,
112 .xres_virtual = 720, 112 .xres_virtual = 720,
@@ -120,7 +120,7 @@ static struct fb_var_screeninfo __initdata hga_default_var = {
120 .width = -1, 120 .width = -1,
121}; 121};
122 122
123static struct fb_fix_screeninfo __initdata hga_fix = { 123static struct fb_fix_screeninfo hga_fix __devinitdata = {
124 .id = "HGA", 124 .id = "HGA",
125 .type = FB_TYPE_PACKED_PIXELS, /* (not sure) */ 125 .type = FB_TYPE_PACKED_PIXELS, /* (not sure) */
126 .visual = FB_VISUAL_MONO10, 126 .visual = FB_VISUAL_MONO10,
@@ -276,7 +276,7 @@ static void hga_blank(int blank_mode)
276 spin_unlock_irqrestore(&hga_reg_lock, flags); 276 spin_unlock_irqrestore(&hga_reg_lock, flags);
277} 277}
278 278
279static int __init hga_card_detect(void) 279static int __devinit hga_card_detect(void)
280{ 280{
281 int count = 0; 281 int count = 0;
282 void __iomem *p, *q; 282 void __iomem *p, *q;
@@ -596,7 +596,7 @@ static int __devinit hgafb_probe(struct platform_device *pdev)
596 return 0; 596 return 0;
597} 597}
598 598
599static int hgafb_remove(struct platform_device *pdev) 599static int __devexit hgafb_remove(struct platform_device *pdev)
600{ 600{
601 struct fb_info *info = platform_get_drvdata(pdev); 601 struct fb_info *info = platform_get_drvdata(pdev);
602 602
@@ -621,7 +621,7 @@ static int hgafb_remove(struct platform_device *pdev)
621 621
622static struct platform_driver hgafb_driver = { 622static struct platform_driver hgafb_driver = {
623 .probe = hgafb_probe, 623 .probe = hgafb_probe,
624 .remove = hgafb_remove, 624 .remove = __devexit_p(hgafb_remove),
625 .driver = { 625 .driver = {
626 .name = "hgafb", 626 .name = "hgafb",
627 }, 627 },
diff --git a/drivers/video/hitfb.c b/drivers/video/hitfb.c
index 393f3f3d3dfe..cfb8d6451014 100644
--- a/drivers/video/hitfb.c
+++ b/drivers/video/hitfb.c
@@ -30,14 +30,14 @@
30 30
31#define WIDTH 640 31#define WIDTH 640
32 32
33static struct fb_var_screeninfo hitfb_var __initdata = { 33static struct fb_var_screeninfo hitfb_var __devinitdata = {
34 .activate = FB_ACTIVATE_NOW, 34 .activate = FB_ACTIVATE_NOW,
35 .height = -1, 35 .height = -1,
36 .width = -1, 36 .width = -1,
37 .vmode = FB_VMODE_NONINTERLACED, 37 .vmode = FB_VMODE_NONINTERLACED,
38}; 38};
39 39
40static struct fb_fix_screeninfo hitfb_fix __initdata = { 40static struct fb_fix_screeninfo hitfb_fix __devinitdata = {
41 .id = "Hitachi HD64461", 41 .id = "Hitachi HD64461",
42 .type = FB_TYPE_PACKED_PIXELS, 42 .type = FB_TYPE_PACKED_PIXELS,
43 .accel = FB_ACCEL_NONE, 43 .accel = FB_ACCEL_NONE,
@@ -417,7 +417,7 @@ err_fb:
417 return ret; 417 return ret;
418} 418}
419 419
420static int __exit hitfb_remove(struct platform_device *dev) 420static int __devexit hitfb_remove(struct platform_device *dev)
421{ 421{
422 struct fb_info *info = platform_get_drvdata(dev); 422 struct fb_info *info = platform_get_drvdata(dev);
423 423
@@ -462,7 +462,7 @@ static const struct dev_pm_ops hitfb_dev_pm_ops = {
462 462
463static struct platform_driver hitfb_driver = { 463static struct platform_driver hitfb_driver = {
464 .probe = hitfb_probe, 464 .probe = hitfb_probe,
465 .remove = __exit_p(hitfb_remove), 465 .remove = __devexit_p(hitfb_remove),
466 .driver = { 466 .driver = {
467 .name = "hitfb", 467 .name = "hitfb",
468 .owner = THIS_MODULE, 468 .owner = THIS_MODULE,
diff --git a/drivers/video/intelfb/intelfb.h b/drivers/video/intelfb/intelfb.h
index 40984551c927..6b51175629c7 100644
--- a/drivers/video/intelfb/intelfb.h
+++ b/drivers/video/intelfb/intelfb.h
@@ -371,10 +371,6 @@ struct intelfb_info {
371 ((dinfo)->chipset == INTEL_965G) || \ 371 ((dinfo)->chipset == INTEL_965G) || \
372 ((dinfo)->chipset == INTEL_965GM)) 372 ((dinfo)->chipset == INTEL_965GM))
373 373
374#ifndef FBIO_WAITFORVSYNC
375#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
376#endif
377
378/*** function prototypes ***/ 374/*** function prototypes ***/
379 375
380extern int intelfb_var_to_depth(const struct fb_var_screeninfo *var); 376extern int intelfb_var_to_depth(const struct fb_var_screeninfo *var);
diff --git a/drivers/video/nuc900fb.c b/drivers/video/nuc900fb.c
index 6bf0d460a738..d4cde79ea15e 100644
--- a/drivers/video/nuc900fb.c
+++ b/drivers/video/nuc900fb.c
@@ -667,7 +667,7 @@ release_irq:
667release_regs: 667release_regs:
668 iounmap(fbi->io); 668 iounmap(fbi->io);
669release_mem_region: 669release_mem_region:
670 release_mem_region((unsigned long)fbi->mem, size); 670 release_mem_region(res->start, size);
671free_fb: 671free_fb:
672 framebuffer_release(fbinfo); 672 framebuffer_release(fbinfo);
673 return ret; 673 return ret;
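
The nuc900fb fix matters because release_mem_region() must be given the same physical start address that was passed to request_mem_region(); the old error path passed a driver-private pointer cast to unsigned long rather than res->start. The correct request/remap/release pairing, as a sketch:

    #include <linux/errno.h>
    #include <linux/io.h>
    #include <linux/ioport.h>

    static void __iomem *regs;

    static int example_map(struct resource *res)
    {
        unsigned long size = resource_size(res);

        if (!request_mem_region(res->start, size, "examplefb"))
            return -EBUSY;

        regs = ioremap(res->start, size);
        if (!regs) {
            /* unwind with the same start/size that was requested */
            release_mem_region(res->start, size);
            return -ENOMEM;
        }
        return 0;
    }
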
diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c
index 2b094dec4a56..46b430978bcc 100644
--- a/drivers/video/s3c2410fb.c
+++ b/drivers/video/s3c2410fb.c
@@ -631,7 +631,7 @@ static struct fb_ops s3c2410fb_ops = {
631 * cache. Once this area is remapped, all virtual memory 631 * cache. Once this area is remapped, all virtual memory
632 * access to the video memory should occur at the new region. 632 * access to the video memory should occur at the new region.
633 */ 633 */
634static int __init s3c2410fb_map_video_memory(struct fb_info *info) 634static int __devinit s3c2410fb_map_video_memory(struct fb_info *info)
635{ 635{
636 struct s3c2410fb_info *fbi = info->par; 636 struct s3c2410fb_info *fbi = info->par;
637 dma_addr_t map_dma; 637 dma_addr_t map_dma;
@@ -814,7 +814,7 @@ static inline void s3c2410fb_cpufreq_deregister(struct s3c2410fb_info *info)
814 814
815static char driver_name[] = "s3c2410fb"; 815static char driver_name[] = "s3c2410fb";
816 816
817static int __init s3c24xxfb_probe(struct platform_device *pdev, 817static int __devinit s3c24xxfb_probe(struct platform_device *pdev,
818 enum s3c_drv_type drv_type) 818 enum s3c_drv_type drv_type)
819{ 819{
820 struct s3c2410fb_info *info; 820 struct s3c2410fb_info *info;
@@ -1018,7 +1018,7 @@ static int __devinit s3c2412fb_probe(struct platform_device *pdev)
1018/* 1018/*
1019 * Cleanup 1019 * Cleanup
1020 */ 1020 */
1021static int s3c2410fb_remove(struct platform_device *pdev) 1021static int __devexit s3c2410fb_remove(struct platform_device *pdev)
1022{ 1022{
1023 struct fb_info *fbinfo = platform_get_drvdata(pdev); 1023 struct fb_info *fbinfo = platform_get_drvdata(pdev);
1024 struct s3c2410fb_info *info = fbinfo->par; 1024 struct s3c2410fb_info *info = fbinfo->par;
@@ -1096,7 +1096,7 @@ static int s3c2410fb_resume(struct platform_device *dev)
1096 1096
1097static struct platform_driver s3c2410fb_driver = { 1097static struct platform_driver s3c2410fb_driver = {
1098 .probe = s3c2410fb_probe, 1098 .probe = s3c2410fb_probe,
1099 .remove = s3c2410fb_remove, 1099 .remove = __devexit_p(s3c2410fb_remove),
1100 .suspend = s3c2410fb_suspend, 1100 .suspend = s3c2410fb_suspend,
1101 .resume = s3c2410fb_resume, 1101 .resume = s3c2410fb_resume,
1102 .driver = { 1102 .driver = {
@@ -1107,7 +1107,7 @@ static struct platform_driver s3c2410fb_driver = {
1107 1107
1108static struct platform_driver s3c2412fb_driver = { 1108static struct platform_driver s3c2412fb_driver = {
1109 .probe = s3c2412fb_probe, 1109 .probe = s3c2412fb_probe,
1110 .remove = s3c2410fb_remove, 1110 .remove = __devexit_p(s3c2410fb_remove),
1111 .suspend = s3c2410fb_suspend, 1111 .suspend = s3c2410fb_suspend,
1112 .resume = s3c2410fb_resume, 1112 .resume = s3c2410fb_resume,
1113 .driver = { 1113 .driver = {
diff --git a/drivers/video/sgivwfb.c b/drivers/video/sgivwfb.c
index 7a3a5e28eca1..53455f295510 100644
--- a/drivers/video/sgivwfb.c
+++ b/drivers/video/sgivwfb.c
@@ -47,7 +47,7 @@ static int ywrap = 0;
47 47
48static int flatpanel_id = -1; 48static int flatpanel_id = -1;
49 49
50static struct fb_fix_screeninfo sgivwfb_fix __initdata = { 50static struct fb_fix_screeninfo sgivwfb_fix __devinitdata = {
51 .id = "SGI Vis WS FB", 51 .id = "SGI Vis WS FB",
52 .type = FB_TYPE_PACKED_PIXELS, 52 .type = FB_TYPE_PACKED_PIXELS,
53 .visual = FB_VISUAL_PSEUDOCOLOR, 53 .visual = FB_VISUAL_PSEUDOCOLOR,
@@ -57,7 +57,7 @@ static struct fb_fix_screeninfo sgivwfb_fix __initdata = {
57 .line_length = 640, 57 .line_length = 640,
58}; 58};
59 59
60static struct fb_var_screeninfo sgivwfb_var __initdata = { 60static struct fb_var_screeninfo sgivwfb_var __devinitdata = {
61 /* 640x480, 8 bpp */ 61 /* 640x480, 8 bpp */
62 .xres = 640, 62 .xres = 640,
63 .yres = 480, 63 .yres = 480,
@@ -79,7 +79,7 @@ static struct fb_var_screeninfo sgivwfb_var __initdata = {
79 .vmode = FB_VMODE_NONINTERLACED 79 .vmode = FB_VMODE_NONINTERLACED
80}; 80};
81 81
82static struct fb_var_screeninfo sgivwfb_var1600sw __initdata = { 82static struct fb_var_screeninfo sgivwfb_var1600sw __devinitdata = {
83 /* 1600x1024, 8 bpp */ 83 /* 1600x1024, 8 bpp */
84 .xres = 1600, 84 .xres = 1600,
85 .yres = 1024, 85 .yres = 1024,
@@ -825,7 +825,7 @@ fail_ioremap_regs:
825 return -ENXIO; 825 return -ENXIO;
826} 826}
827 827
828static int sgivwfb_remove(struct platform_device *dev) 828static int __devexit sgivwfb_remove(struct platform_device *dev)
829{ 829{
830 struct fb_info *info = platform_get_drvdata(dev); 830 struct fb_info *info = platform_get_drvdata(dev);
831 831
@@ -845,7 +845,7 @@ static int sgivwfb_remove(struct platform_device *dev)
845 845
846static struct platform_driver sgivwfb_driver = { 846static struct platform_driver sgivwfb_driver = {
847 .probe = sgivwfb_probe, 847 .probe = sgivwfb_probe,
848 .remove = sgivwfb_remove, 848 .remove = __devexit_p(sgivwfb_remove),
849 .driver = { 849 .driver = {
850 .name = "sgivwfb", 850 .name = "sgivwfb",
851 }, 851 },
diff --git a/drivers/video/sis/sis_main.c b/drivers/video/sis/sis_main.c
index a531a0f7cdf2..559bf1727a2b 100644
--- a/drivers/video/sis/sis_main.c
+++ b/drivers/video/sis/sis_main.c
@@ -1845,7 +1845,7 @@ sisfb_get_fix(struct fb_fix_screeninfo *fix, int con, struct fb_info *info)
1845 1845
1846 memset(fix, 0, sizeof(struct fb_fix_screeninfo)); 1846 memset(fix, 0, sizeof(struct fb_fix_screeninfo));
1847 1847
1848 strcpy(fix->id, ivideo->myid); 1848 strlcpy(fix->id, ivideo->myid, sizeof(fix->id));
1849 1849
1850 mutex_lock(&info->mm_lock); 1850 mutex_lock(&info->mm_lock);
1851 fix->smem_start = ivideo->video_base + ivideo->video_offset; 1851 fix->smem_start = ivideo->video_base + ivideo->video_offset;
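The sisfb change swaps an unbounded strcpy() for strlcpy(): fix->id is a fixed 16-byte array in struct fb_fix_screeninfo, so a longer ivideo->myid would overflow it. strlcpy() copies at most size - 1 bytes and always NUL-terminates. A small sketch of the safe form (hypothetical helper, not from this patch):

    #include <linux/string.h>
    #include <linux/fb.h>

    static void foo_fill_fix(struct fb_fix_screeninfo *fix, const char *myid)
    {
            /* fix->id is char[16]; strcpy() would overrun it for long
             * names, strlcpy() truncates and still NUL-terminates. */
            strlcpy(fix->id, myid, sizeof(fix->id));
    }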
diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c
index 9b5532b4de35..bc67251f1a2f 100644
--- a/drivers/video/vfb.c
+++ b/drivers/video/vfb.c
@@ -78,7 +78,7 @@ static void rvfree(void *mem, unsigned long size)
78 vfree(mem); 78 vfree(mem);
79} 79}
80 80
81static struct fb_var_screeninfo vfb_default __initdata = { 81static struct fb_var_screeninfo vfb_default __devinitdata = {
82 .xres = 640, 82 .xres = 640,
83 .yres = 480, 83 .yres = 480,
84 .xres_virtual = 640, 84 .xres_virtual = 640,
@@ -100,7 +100,7 @@ static struct fb_var_screeninfo vfb_default __initdata = {
100 .vmode = FB_VMODE_NONINTERLACED, 100 .vmode = FB_VMODE_NONINTERLACED,
101}; 101};
102 102
103static struct fb_fix_screeninfo vfb_fix __initdata = { 103static struct fb_fix_screeninfo vfb_fix __devinitdata = {
104 .id = "Virtual FB", 104 .id = "Virtual FB",
105 .type = FB_TYPE_PACKED_PIXELS, 105 .type = FB_TYPE_PACKED_PIXELS,
106 .visual = FB_VISUAL_PSEUDOCOLOR, 106 .visual = FB_VISUAL_PSEUDOCOLOR,
diff --git a/drivers/video/vga16fb.c b/drivers/video/vga16fb.c
index 149c47ac7e93..28ccab44a391 100644
--- a/drivers/video/vga16fb.c
+++ b/drivers/video/vga16fb.c
@@ -65,7 +65,7 @@ struct vga16fb_par {
65 65
66/* --------------------------------------------------------------------- */ 66/* --------------------------------------------------------------------- */
67 67
68static struct fb_var_screeninfo vga16fb_defined __initdata = { 68static struct fb_var_screeninfo vga16fb_defined __devinitdata = {
69 .xres = 640, 69 .xres = 640,
70 .yres = 480, 70 .yres = 480,
71 .xres_virtual = 640, 71 .xres_virtual = 640,
@@ -85,7 +85,7 @@ static struct fb_var_screeninfo vga16fb_defined __initdata = {
85}; 85};
86 86
87/* name should not depend on EGA/VGA */ 87/* name should not depend on EGA/VGA */
88static struct fb_fix_screeninfo vga16fb_fix __initdata = { 88static struct fb_fix_screeninfo vga16fb_fix __devinitdata = {
89 .id = "VGA16 VGA", 89 .id = "VGA16 VGA",
90 .smem_start = VGA_FB_PHYS, 90 .smem_start = VGA_FB_PHYS,
91 .smem_len = VGA_FB_PHYS_LEN, 91 .smem_len = VGA_FB_PHYS_LEN,
@@ -1287,7 +1287,7 @@ static struct fb_ops vga16fb_ops = {
1287}; 1287};
1288 1288
1289#ifndef MODULE 1289#ifndef MODULE
1290static int vga16fb_setup(char *options) 1290static int __init vga16fb_setup(char *options)
1291{ 1291{
1292 char *this_opt; 1292 char *this_opt;
1293 1293
@@ -1393,7 +1393,7 @@ static int __devinit vga16fb_probe(struct platform_device *dev)
1393 return ret; 1393 return ret;
1394} 1394}
1395 1395
1396static int vga16fb_remove(struct platform_device *dev) 1396static int __devexit vga16fb_remove(struct platform_device *dev)
1397{ 1397{
1398 struct fb_info *info = platform_get_drvdata(dev); 1398 struct fb_info *info = platform_get_drvdata(dev);
1399 1399
@@ -1405,7 +1405,7 @@ static int vga16fb_remove(struct platform_device *dev)
1405 1405
1406static struct platform_driver vga16fb_driver = { 1406static struct platform_driver vga16fb_driver = {
1407 .probe = vga16fb_probe, 1407 .probe = vga16fb_probe,
1408 .remove = vga16fb_remove, 1408 .remove = __devexit_p(vga16fb_remove),
1409 .driver = { 1409 .driver = {
1410 .name = "vga16fb", 1410 .name = "vga16fb",
1411 }, 1411 },
diff --git a/drivers/video/w100fb.c b/drivers/video/w100fb.c
index 31b0e17ed090..e66b8b19ce5d 100644
--- a/drivers/video/w100fb.c
+++ b/drivers/video/w100fb.c
@@ -53,7 +53,7 @@ static void w100_update_enable(void);
53static void w100_update_disable(void); 53static void w100_update_disable(void);
54static void calc_hsync(struct w100fb_par *par); 54static void calc_hsync(struct w100fb_par *par);
55static void w100_init_graphic_engine(struct w100fb_par *par); 55static void w100_init_graphic_engine(struct w100fb_par *par);
56struct w100_pll_info *w100_get_xtal_table(unsigned int freq); 56struct w100_pll_info *w100_get_xtal_table(unsigned int freq) __devinit;
57 57
58/* Pseudo palette size */ 58/* Pseudo palette size */
59#define MAX_PALETTES 16 59#define MAX_PALETTES 16
@@ -782,7 +782,7 @@ out:
782} 782}
783 783
784 784
785static int w100fb_remove(struct platform_device *pdev) 785static int __devexit w100fb_remove(struct platform_device *pdev)
786{ 786{
787 struct fb_info *info = platform_get_drvdata(pdev); 787 struct fb_info *info = platform_get_drvdata(pdev);
788 struct w100fb_par *par=info->par; 788 struct w100fb_par *par=info->par;
@@ -1020,7 +1020,7 @@ static struct pll_entries {
1020 { 0 }, 1020 { 0 },
1021}; 1021};
1022 1022
1023struct w100_pll_info *w100_get_xtal_table(unsigned int freq) 1023struct w100_pll_info __devinit *w100_get_xtal_table(unsigned int freq)
1024{ 1024{
1025 struct pll_entries *pll_entry = w100_pll_tables; 1025 struct pll_entries *pll_entry = w100_pll_tables;
1026 1026
@@ -1611,7 +1611,7 @@ static void w100_vsync(void)
1611 1611
1612static struct platform_driver w100fb_driver = { 1612static struct platform_driver w100fb_driver = {
1613 .probe = w100fb_probe, 1613 .probe = w100fb_probe,
1614 .remove = w100fb_remove, 1614 .remove = __devexit_p(w100fb_remove),
1615 .suspend = w100fb_suspend, 1615 .suspend = w100fb_suspend,
1616 .resume = w100fb_resume, 1616 .resume = w100fb_resume,
1617 .driver = { 1617 .driver = {
@@ -1619,7 +1619,7 @@ static struct platform_driver w100fb_driver = {
1619 }, 1619 },
1620}; 1620};
1621 1621
1622int __devinit w100fb_init(void) 1622int __init w100fb_init(void)
1623{ 1623{
1624 return platform_driver_register(&w100fb_driver); 1624 return platform_driver_register(&w100fb_driver);
1625} 1625}
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index b87ba23442d2..afcfacc9bbe2 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -145,13 +145,19 @@ config KS8695_WATCHDOG
145 Watchdog timer embedded into KS8695 processor. This will reboot your 145 Watchdog timer embedded into KS8695 processor. This will reboot your
146 system when the timeout is reached. 146 system when the timeout is reached.
147 147
148config HAVE_S3C2410_WATCHDOG
149 bool
150 help
151 This will include watchdog timer support for Samsung SoCs. If
152 you want to include watchdog support for any machine, select
153 this in the respective mach-XXXX/Kconfig file.
154
148config S3C2410_WATCHDOG 155config S3C2410_WATCHDOG
149 tristate "S3C2410 Watchdog" 156 tristate "S3C2410 Watchdog"
150 depends on ARCH_S3C2410 157 depends on ARCH_S3C2410 || HAVE_S3C2410_WATCHDOG
151 help 158 help
152 Watchdog timer block in the Samsung S3C2410 chips. This will 159 Watchdog timer block in the Samsung SoCs. This will reboot
153 reboot the system when the timer expires with the watchdog 160 the system when the timer expires with the watchdog enabled.
154 enabled.
155 161
156 The driver is limited by the speed of the system's PCLK 162 The driver is limited by the speed of the system's PCLK
157 signal, so with reasonably fast systems (PCLK around 50-66MHz) 163 signal, so with reasonably fast systems (PCLK around 50-66MHz)
@@ -306,6 +312,18 @@ config MAX63XX_WATCHDOG
306 help 312 help
307 Support for memory mapped max63{69,70,71,72,73,74} watchdog timer. 313 Support for memory mapped max63{69,70,71,72,73,74} watchdog timer.
308 314
315config IMX2_WDT
316 tristate "IMX2+ Watchdog"
317 depends on ARCH_MX2 || ARCH_MX25 || ARCH_MX3 || ARCH_MX5
318 help
319 This is the driver for the hardware watchdog
320 on the Freescale IMX2 and later processors.
321 If you have one of these processors and wish to have
322 watchdog support enabled, say Y, otherwise say N.
323
324 To compile this driver as a module, choose M here: the
325 module will be called imx2_wdt.
326
309# AVR32 Architecture 327# AVR32 Architecture
310 328
311config AT32AP700X_WDT 329config AT32AP700X_WDT
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 5e3cb95bb0e9..72f3e2073f8e 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_STMP3XXX_WATCHDOG) += stmp3xxx_wdt.o
47obj-$(CONFIG_NUC900_WATCHDOG) += nuc900_wdt.o 47obj-$(CONFIG_NUC900_WATCHDOG) += nuc900_wdt.o
48obj-$(CONFIG_ADX_WATCHDOG) += adx_wdt.o 48obj-$(CONFIG_ADX_WATCHDOG) += adx_wdt.o
49obj-$(CONFIG_TS72XX_WATCHDOG) += ts72xx_wdt.o 49obj-$(CONFIG_TS72XX_WATCHDOG) += ts72xx_wdt.o
50obj-$(CONFIG_IMX2_WDT) += imx2_wdt.o
50 51
51# AVR32 Architecture 52# AVR32 Architecture
52obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o 53obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o
diff --git a/drivers/watchdog/bfin_wdt.c b/drivers/watchdog/bfin_wdt.c
index 9c7ccd1e9088..9042a95fc98c 100644
--- a/drivers/watchdog/bfin_wdt.c
+++ b/drivers/watchdog/bfin_wdt.c
@@ -23,6 +23,7 @@
23#include <linux/interrupt.h> 23#include <linux/interrupt.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <asm/blackfin.h> 25#include <asm/blackfin.h>
26#include <asm/bfin_watchdog.h>
26 27
27#define stamp(fmt, args...) \ 28#define stamp(fmt, args...) \
28 pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args) 29 pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args)
@@ -49,24 +50,6 @@
49# define bfin_write_WDOG_STAT(x) bfin_write_WDOGA_STAT(x) 50# define bfin_write_WDOG_STAT(x) bfin_write_WDOGA_STAT(x)
50#endif 51#endif
51 52
52/* Bit in SWRST that indicates boot caused by watchdog */
53#define SWRST_RESET_WDOG 0x4000
54
55/* Bit in WDOG_CTL that indicates watchdog has expired (WDR0) */
56#define WDOG_EXPIRED 0x8000
57
58/* Masks for WDEV field in WDOG_CTL register */
59#define ICTL_RESET 0x0
60#define ICTL_NMI 0x2
61#define ICTL_GPI 0x4
62#define ICTL_NONE 0x6
63#define ICTL_MASK 0x6
64
65/* Masks for WDEN field in WDOG_CTL register */
66#define WDEN_MASK 0x0FF0
67#define WDEN_ENABLE 0x0000
68#define WDEN_DISABLE 0x0AD0
69
70/* some defaults */ 53/* some defaults */
71#define WATCHDOG_TIMEOUT 20 54#define WATCHDOG_TIMEOUT 20
72 55
diff --git a/drivers/watchdog/booke_wdt.c b/drivers/watchdog/booke_wdt.c
index 801ead191499..3d49671cdf5a 100644
--- a/drivers/watchdog/booke_wdt.c
+++ b/drivers/watchdog/booke_wdt.c
@@ -137,12 +137,12 @@ static long booke_wdt_ioctl(struct file *file,
137 if (copy_to_user((void *)arg, &ident, sizeof(ident))) 137 if (copy_to_user((void *)arg, &ident, sizeof(ident)))
138 return -EFAULT; 138 return -EFAULT;
139 case WDIOC_GETSTATUS: 139 case WDIOC_GETSTATUS:
140 return put_user(ident.options, p); 140 return put_user(0, p);
141 case WDIOC_GETBOOTSTATUS: 141 case WDIOC_GETBOOTSTATUS:
142 /* XXX: something is clearing TSR */ 142 /* XXX: something is clearing TSR */
143 tmp = mfspr(SPRN_TSR) & TSR_WRS(3); 143 tmp = mfspr(SPRN_TSR) & TSR_WRS(3);
144 /* returns 1 if last reset was caused by the WDT */ 144 /* returns CARDRESET if last reset was caused by the WDT */
145 return (tmp ? 1 : 0); 145 return (tmp ? WDIOF_CARDRESET : 0);
146 case WDIOC_SETOPTIONS: 146 case WDIOC_SETOPTIONS:
147 if (get_user(tmp, p)) 147 if (get_user(tmp, p))
148 return -EINVAL; 148 return -EINVAL;
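With the booke_wdt change, WDIOC_GETSTATUS now reports 0 (no live status flags) and WDIOC_GETBOOTSTATUS returns the standard WDIOF_CARDRESET flag instead of a bare 1, so generic watchdog userspace can interpret the answer. A minimal userspace check against a /dev/watchdog node (a sketch, not from this patch; the magic-close 'V' is honoured only by drivers that support it):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/watchdog.h>

    int main(void)
    {
            int flags = 0;
            int fd = open("/dev/watchdog", O_RDWR);  /* opening starts the WDT */

            if (fd < 0)
                    return 1;
            /* WDIOF_CARDRESET set means the last reboot was forced by
             * the watchdog timer expiring. */
            if (ioctl(fd, WDIOC_GETBOOTSTATUS, &flags) == 0)
                    printf("last boot %s caused by watchdog\n",
                           (flags & WDIOF_CARDRESET) ? "was" : "was not");
            write(fd, "V", 1);      /* magic close, where supported */
            close(fd);
            return 0;
    }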
diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c
index d1c4e55b1db0..3f3dc093ad68 100644
--- a/drivers/watchdog/eurotechwdt.c
+++ b/drivers/watchdog/eurotechwdt.c
@@ -68,7 +68,6 @@ static spinlock_t eurwdt_lock;
68 68
69/* 69/*
70 * You must set these - there is no sane way to probe for this board. 70 * You must set these - there is no sane way to probe for this board.
71 * You can use eurwdt=x,y to set these now.
72 */ 71 */
73 72
74static int io = 0x3f0; 73static int io = 0x3f0;
diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c
index 5133bca5ccbe..481d1ad43464 100644
--- a/drivers/watchdog/iTCO_vendor_support.c
+++ b/drivers/watchdog/iTCO_vendor_support.c
@@ -101,13 +101,6 @@ static void supermicro_old_pre_stop(unsigned long acpibase)
101 outl(val32, SMI_EN); /* Needed to deactivate watchdog */ 101 outl(val32, SMI_EN); /* Needed to deactivate watchdog */
102} 102}
103 103
104static void supermicro_old_pre_keepalive(unsigned long acpibase)
105{
106 /* Reload TCO Timer (done in iTCO_wdt_keepalive) + */
107 /* Clear "Expire Flag" (Bit 3 of TC01_STS register) */
108 outb(0x08, TCO1_STS);
109}
110
111/* 104/*
112 * Vendor Support: 2 105 * Vendor Support: 2
113 * Board: Super Micro Computer Inc. P4SBx, P4DPx 106 * Board: Super Micro Computer Inc. P4SBx, P4DPx
@@ -337,9 +330,7 @@ EXPORT_SYMBOL(iTCO_vendor_pre_stop);
337 330
338void iTCO_vendor_pre_keepalive(unsigned long acpibase, unsigned int heartbeat) 331void iTCO_vendor_pre_keepalive(unsigned long acpibase, unsigned int heartbeat)
339{ 332{
340 if (vendorsupport == SUPERMICRO_OLD_BOARD) 333 if (vendorsupport == SUPERMICRO_NEW_BOARD)
341 supermicro_old_pre_keepalive(acpibase);
342 else if (vendorsupport == SUPERMICRO_NEW_BOARD)
343 supermicro_new_pre_set_heartbeat(heartbeat); 334 supermicro_new_pre_set_heartbeat(heartbeat);
344} 335}
345EXPORT_SYMBOL(iTCO_vendor_pre_keepalive); 336EXPORT_SYMBOL(iTCO_vendor_pre_keepalive);
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 8da886035374..69de8713b8e4 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -40,7 +40,7 @@
40 40
41/* Module and version information */ 41/* Module and version information */
42#define DRV_NAME "iTCO_wdt" 42#define DRV_NAME "iTCO_wdt"
43#define DRV_VERSION "1.05" 43#define DRV_VERSION "1.06"
44#define PFX DRV_NAME ": " 44#define PFX DRV_NAME ": "
45 45
46/* Includes */ 46/* Includes */
@@ -391,8 +391,8 @@ static struct platform_device *iTCO_wdt_platform_device;
391#define WATCHDOG_HEARTBEAT 30 /* 30 sec default heartbeat */ 391#define WATCHDOG_HEARTBEAT 30 /* 30 sec default heartbeat */
392static int heartbeat = WATCHDOG_HEARTBEAT; /* in seconds */ 392static int heartbeat = WATCHDOG_HEARTBEAT; /* in seconds */
393module_param(heartbeat, int, 0); 393module_param(heartbeat, int, 0);
394MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. " 394MODULE_PARM_DESC(heartbeat, "Watchdog timeout in seconds. "
395 "(2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=" 395 "5..76 (TCO v1) or 3..614 (TCO v2), default="
396 __MODULE_STRING(WATCHDOG_HEARTBEAT) ")"); 396 __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
397 397
398static int nowayout = WATCHDOG_NOWAYOUT; 398static int nowayout = WATCHDOG_NOWAYOUT;
@@ -523,8 +523,13 @@ static int iTCO_wdt_keepalive(void)
523 /* Reload the timer by writing to the TCO Timer Counter register */ 523 /* Reload the timer by writing to the TCO Timer Counter register */
524 if (iTCO_wdt_private.iTCO_version == 2) 524 if (iTCO_wdt_private.iTCO_version == 2)
525 outw(0x01, TCO_RLD); 525 outw(0x01, TCO_RLD);
526 else if (iTCO_wdt_private.iTCO_version == 1) 526 else if (iTCO_wdt_private.iTCO_version == 1) {
527 /* Reset the timeout status bit so that the timer
528 * needs to count down twice again before rebooting */
529 outw(0x0008, TCO1_STS); /* write 1 to clear bit */
530
527 outb(0x01, TCO_RLD); 531 outb(0x01, TCO_RLD);
532 }
528 533
529 spin_unlock(&iTCO_wdt_private.io_lock); 534 spin_unlock(&iTCO_wdt_private.io_lock);
530 return 0; 535 return 0;
@@ -537,6 +542,11 @@ static int iTCO_wdt_set_heartbeat(int t)
537 unsigned int tmrval; 542 unsigned int tmrval;
538 543
539 tmrval = seconds_to_ticks(t); 544 tmrval = seconds_to_ticks(t);
545
546 /* For TCO v1 the timer counts down twice before rebooting */
547 if (iTCO_wdt_private.iTCO_version == 1)
548 tmrval /= 2;
549
540 /* from the specs: */ 550 /* from the specs: */
541 /* "Values of 0h-3h are ignored and should not be attempted" */ 551 /* "Values of 0h-3h are ignored and should not be attempted" */
542 if (tmrval < 0x04) 552 if (tmrval < 0x04)
@@ -593,6 +603,8 @@ static int iTCO_wdt_get_timeleft(int *time_left)
593 spin_lock(&iTCO_wdt_private.io_lock); 603 spin_lock(&iTCO_wdt_private.io_lock);
594 val8 = inb(TCO_RLD); 604 val8 = inb(TCO_RLD);
595 val8 &= 0x3f; 605 val8 &= 0x3f;
606 if (!(inw(TCO1_STS) & 0x0008))
607 val8 += (inb(TCOv1_TMR) & 0x3f);
596 spin_unlock(&iTCO_wdt_private.io_lock); 608 spin_unlock(&iTCO_wdt_private.io_lock);
597 609
598 *time_left = (val8 * 6) / 10; 610 *time_left = (val8 * 6) / 10;
@@ -832,9 +844,9 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
832 TCOBASE); 844 TCOBASE);
833 845
834 /* Clear out the (probably old) status */ 846 /* Clear out the (probably old) status */
835 outb(8, TCO1_STS); /* Clear the Time Out Status bit */ 847 outw(0x0008, TCO1_STS); /* Clear the Time Out Status bit */
836 outb(2, TCO2_STS); /* Clear SECOND_TO_STS bit */ 848 outw(0x0002, TCO2_STS); /* Clear SECOND_TO_STS bit */
837 outb(4, TCO2_STS); /* Clear BOOT_STS bit */ 849 outw(0x0004, TCO2_STS); /* Clear BOOT_STS bit */
838 850
839 /* Make sure the watchdog is not running */ 851 /* Make sure the watchdog is not running */
840 iTCO_wdt_stop(); 852 iTCO_wdt_stop();
@@ -844,8 +856,7 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
844 if (iTCO_wdt_set_heartbeat(heartbeat)) { 856 if (iTCO_wdt_set_heartbeat(heartbeat)) {
845 iTCO_wdt_set_heartbeat(WATCHDOG_HEARTBEAT); 857 iTCO_wdt_set_heartbeat(WATCHDOG_HEARTBEAT);
846 printk(KERN_INFO PFX 858 printk(KERN_INFO PFX
847 "heartbeat value must be 2 < heartbeat < 39 (TCO v1) " 859 "timeout value out of range, using %d\n", heartbeat);
848 "or 613 (TCO v2), using %d\n", heartbeat);
849 } 860 }
850 861
851 ret = misc_register(&iTCO_wdt_miscdev); 862 ret = misc_register(&iTCO_wdt_miscdev);
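The v1.06 changes encode the fact that a TCO v1 timer counts down twice before rebooting, while v2 counts down once. The reload value is in 0.6-second ticks (the driver converts back with *time_left = (val8 * 6) / 10), so iTCO_wdt_set_heartbeat() halves the tick count on v1, and iTCO_wdt_get_timeleft() adds a whole extra period while the first countdown is still in progress (TIMEOUT status bit clear). A worked sketch of the arithmetic, assuming the 0.6 s tick implied above:

    /* Sketch of the TCO v1 timeout arithmetic, 0.6 s per tick. */
    static unsigned int seconds_to_ticks_v1(unsigned int secs)
    {
            unsigned int tmrval = (secs * 10) / 6;  /* e.g. 30 s -> 50 ticks */

            /* A v1 timer fires once, reloads, and only reboots after a
             * second expiry, so program half the requested ticks:
             * 50 -> 25 ticks, i.e. 15 s per pass, ~30 s over two passes. */
            tmrval /= 2;
            return tmrval;
    }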
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
new file mode 100644
index 000000000000..ea25885781bb
--- /dev/null
+++ b/drivers/watchdog/imx2_wdt.c
@@ -0,0 +1,358 @@
1/*
2 * Watchdog driver for IMX2 and later processors
3 *
4 * Copyright (C) 2010 Wolfram Sang, Pengutronix e.K. <w.sang@pengutronix.de>
5 *
6 * some parts adapted from similar drivers by Darius Augulis and Vladimir
7 * Zapolskiy, additional improvements by Wim Van Sebroeck.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as published by
11 * the Free Software Foundation.
12 *
13 * NOTE: MX1 has a slightly different Watchdog than MX2 and later:
14 *
15 * MX1: MX2+:
16 * ---- -----
17 * Registers: 32-bit 16-bit
18 * Stoppable timer: Yes No
19 * Need to enable clk: No Yes
20 * Halt on suspend: Manual Can be automatic
21 */
22
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/miscdevice.h>
26#include <linux/module.h>
27#include <linux/moduleparam.h>
28#include <linux/platform_device.h>
29#include <linux/watchdog.h>
30#include <linux/clk.h>
31#include <linux/fs.h>
32#include <linux/io.h>
33#include <linux/uaccess.h>
34#include <linux/timer.h>
35#include <linux/jiffies.h>
36#include <mach/hardware.h>
37
38#define DRIVER_NAME "imx2-wdt"
39
40#define IMX2_WDT_WCR 0x00 /* Control Register */
41#define IMX2_WDT_WCR_WT (0xFF << 8) /* -> Watchdog Timeout Field */
42#define IMX2_WDT_WCR_WRE (1 << 3) /* -> WDOG Reset Enable */
43#define IMX2_WDT_WCR_WDE (1 << 2) /* -> Watchdog Enable */
44
45#define IMX2_WDT_WSR 0x02 /* Service Register */
46#define IMX2_WDT_SEQ1 0x5555 /* -> service sequence 1 */
47#define IMX2_WDT_SEQ2 0xAAAA /* -> service sequence 2 */
48
49#define IMX2_WDT_MAX_TIME 128
50#define IMX2_WDT_DEFAULT_TIME 60 /* in seconds */
51
52#define WDOG_SEC_TO_COUNT(s) ((s * 2 - 1) << 8)
53
54#define IMX2_WDT_STATUS_OPEN 0
55#define IMX2_WDT_STATUS_STARTED 1
56#define IMX2_WDT_EXPECT_CLOSE 2
57
58static struct {
59 struct clk *clk;
60 void __iomem *base;
61 unsigned timeout;
62 unsigned long status;
63 struct timer_list timer; /* Pings the watchdog when closed */
64} imx2_wdt;
65
66static struct miscdevice imx2_wdt_miscdev;
67
68static int nowayout = WATCHDOG_NOWAYOUT;
69module_param(nowayout, int, 0);
70MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
71 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
72
73
74static unsigned timeout = IMX2_WDT_DEFAULT_TIME;
75module_param(timeout, uint, 0);
76MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds (default="
77 __MODULE_STRING(IMX2_WDT_DEFAULT_TIME) ")");
78
79static const struct watchdog_info imx2_wdt_info = {
80 .identity = "imx2+ watchdog",
81 .options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
82};
83
84static inline void imx2_wdt_setup(void)
85{
86 u16 val = __raw_readw(imx2_wdt.base + IMX2_WDT_WCR);
87
88 /* Strip the old watchdog Time-Out value */
89 val &= ~IMX2_WDT_WCR_WT;
90 /* Generate reset if WDOG times out */
91 val &= ~IMX2_WDT_WCR_WRE;
92 /* Keep Watchdog Disabled */
93 val &= ~IMX2_WDT_WCR_WDE;
94 /* Set the watchdog's Time-Out value */
95 val |= WDOG_SEC_TO_COUNT(imx2_wdt.timeout);
96
97 __raw_writew(val, imx2_wdt.base + IMX2_WDT_WCR);
98
99 /* enable the watchdog */
100 val |= IMX2_WDT_WCR_WDE;
101 __raw_writew(val, imx2_wdt.base + IMX2_WDT_WCR);
102}
103
104static inline void imx2_wdt_ping(void)
105{
106 __raw_writew(IMX2_WDT_SEQ1, imx2_wdt.base + IMX2_WDT_WSR);
107 __raw_writew(IMX2_WDT_SEQ2, imx2_wdt.base + IMX2_WDT_WSR);
108}
109
110static void imx2_wdt_timer_ping(unsigned long arg)
111{
112 /* ping it every imx2_wdt.timeout / 2 seconds to prevent reboot */
113 imx2_wdt_ping();
114 mod_timer(&imx2_wdt.timer, jiffies + imx2_wdt.timeout * HZ / 2);
115}
116
117static void imx2_wdt_start(void)
118{
119 if (!test_and_set_bit(IMX2_WDT_STATUS_STARTED, &imx2_wdt.status)) {
120 /* at our first start we enable clock and do initialisations */
121 clk_enable(imx2_wdt.clk);
122
123 imx2_wdt_setup();
124 } else /* delete the timer that pings the watchdog after close */
125 del_timer_sync(&imx2_wdt.timer);
126
127 /* Watchdog is enabled - time to reload the timeout value */
128 imx2_wdt_ping();
129}
130
131static void imx2_wdt_stop(void)
132{
133 /* we don't need a clk_disable, it cannot be disabled once started.
134 * We use a timer to ping the watchdog while /dev/watchdog is closed */
135 imx2_wdt_timer_ping(0);
136}
137
138static void imx2_wdt_set_timeout(int new_timeout)
139{
140 u16 val = __raw_readw(imx2_wdt.base + IMX2_WDT_WCR);
141
142 /* set the new timeout value in the WSR */
143 val &= ~IMX2_WDT_WCR_WT;
144 val |= WDOG_SEC_TO_COUNT(new_timeout);
145 __raw_writew(val, imx2_wdt.base + IMX2_WDT_WCR);
146}
147
148static int imx2_wdt_open(struct inode *inode, struct file *file)
149{
150 if (test_and_set_bit(IMX2_WDT_STATUS_OPEN, &imx2_wdt.status))
151 return -EBUSY;
152
153 imx2_wdt_start();
154 return nonseekable_open(inode, file);
155}
156
157static int imx2_wdt_close(struct inode *inode, struct file *file)
158{
159 if (test_bit(IMX2_WDT_EXPECT_CLOSE, &imx2_wdt.status) && !nowayout)
160 imx2_wdt_stop();
161 else {
162 dev_crit(imx2_wdt_miscdev.parent,
163 "Unexpected close: Expect reboot!\n");
164 imx2_wdt_ping();
165 }
166
167 clear_bit(IMX2_WDT_EXPECT_CLOSE, &imx2_wdt.status);
168 clear_bit(IMX2_WDT_STATUS_OPEN, &imx2_wdt.status);
169 return 0;
170}
171
172static long imx2_wdt_ioctl(struct file *file, unsigned int cmd,
173 unsigned long arg)
174{
175 void __user *argp = (void __user *)arg;
176 int __user *p = argp;
177 int new_value;
178
179 switch (cmd) {
180 case WDIOC_GETSUPPORT:
181 return copy_to_user(argp, &imx2_wdt_info,
182 sizeof(struct watchdog_info)) ? -EFAULT : 0;
183
184 case WDIOC_GETSTATUS:
185 case WDIOC_GETBOOTSTATUS:
186 return put_user(0, p);
187
188 case WDIOC_KEEPALIVE:
189 imx2_wdt_ping();
190 return 0;
191
192 case WDIOC_SETTIMEOUT:
193 if (get_user(new_value, p))
194 return -EFAULT;
195 if ((new_value < 1) || (new_value > IMX2_WDT_MAX_TIME))
196 return -EINVAL;
197 imx2_wdt_set_timeout(new_value);
198 imx2_wdt.timeout = new_value;
199 imx2_wdt_ping();
200
201 /* Fallthrough to return current value */
202 case WDIOC_GETTIMEOUT:
203 return put_user(imx2_wdt.timeout, p);
204
205 default:
206 return -ENOTTY;
207 }
208}
209
210static ssize_t imx2_wdt_write(struct file *file, const char __user *data,
211 size_t len, loff_t *ppos)
212{
213 size_t i;
214 char c;
215
216 if (len == 0) /* can this even happen? */
217 return 0;
218
219 clear_bit(IMX2_WDT_EXPECT_CLOSE, &imx2_wdt.status);
220 /* scan to see whether or not we got the magic character */
221 for (i = 0; i != len; i++) {
222 if (get_user(c, data + i))
223 return -EFAULT;
224 if (c == 'V')
225 set_bit(IMX2_WDT_EXPECT_CLOSE, &imx2_wdt.status);
226 }
227
228 imx2_wdt_ping();
229 return len;
230}
231
232static const struct file_operations imx2_wdt_fops = {
233 .owner = THIS_MODULE,
234 .llseek = no_llseek,
235 .unlocked_ioctl = imx2_wdt_ioctl,
236 .open = imx2_wdt_open,
237 .release = imx2_wdt_close,
238 .write = imx2_wdt_write,
239};
240
241static struct miscdevice imx2_wdt_miscdev = {
242 .minor = WATCHDOG_MINOR,
243 .name = "watchdog",
244 .fops = &imx2_wdt_fops,
245};
246
247static int __init imx2_wdt_probe(struct platform_device *pdev)
248{
249 int ret;
250 int res_size;
251 struct resource *res;
252
253 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
254 if (!res) {
255 dev_err(&pdev->dev, "can't get device resources\n");
256 return -ENODEV;
257 }
258
259 res_size = resource_size(res);
260 if (!devm_request_mem_region(&pdev->dev, res->start, res_size,
261 res->name)) {
262 dev_err(&pdev->dev, "can't allocate %d bytes at %d address\n",
263 res_size, res->start);
264 return -ENOMEM;
265 }
266
267 imx2_wdt.base = devm_ioremap_nocache(&pdev->dev, res->start, res_size);
268 if (!imx2_wdt.base) {
269 dev_err(&pdev->dev, "ioremap failed\n");
270 return -ENOMEM;
271 }
272
273 imx2_wdt.clk = clk_get_sys("imx-wdt.0", NULL);
274 if (IS_ERR(imx2_wdt.clk)) {
275 dev_err(&pdev->dev, "can't get Watchdog clock\n");
276 return PTR_ERR(imx2_wdt.clk);
277 }
278
279 imx2_wdt.timeout = clamp_t(unsigned, timeout, 1, IMX2_WDT_MAX_TIME);
280 if (imx2_wdt.timeout != timeout)
281 dev_warn(&pdev->dev, "Initial timeout out of range! "
282 "Clamped from %u to %u\n", timeout, imx2_wdt.timeout);
283
284 setup_timer(&imx2_wdt.timer, imx2_wdt_timer_ping, 0);
285
286 imx2_wdt_miscdev.parent = &pdev->dev;
287 ret = misc_register(&imx2_wdt_miscdev);
288 if (ret)
289 goto fail;
290
291 dev_info(&pdev->dev,
292 "IMX2+ Watchdog Timer enabled. timeout=%ds (nowayout=%d)\n",
293 imx2_wdt.timeout, nowayout);
294 return 0;
295
296fail:
297 imx2_wdt_miscdev.parent = NULL;
298 clk_put(imx2_wdt.clk);
299 return ret;
300}
301
302static int __exit imx2_wdt_remove(struct platform_device *pdev)
303{
304 misc_deregister(&imx2_wdt_miscdev);
305
306 if (test_bit(IMX2_WDT_STATUS_STARTED, &imx2_wdt.status)) {
307 del_timer_sync(&imx2_wdt.timer);
308
309 dev_crit(imx2_wdt_miscdev.parent,
310 "Device removed: Expect reboot!\n");
311 } else
312 clk_put(imx2_wdt.clk);
313
314 imx2_wdt_miscdev.parent = NULL;
315 return 0;
316}
317
318static void imx2_wdt_shutdown(struct platform_device *pdev)
319{
320 if (test_bit(IMX2_WDT_STATUS_STARTED, &imx2_wdt.status)) {
321 /* we are running; delete the timer but program the maximum
322 * timeout before a reboot takes place */
323 del_timer_sync(&imx2_wdt.timer);
324 imx2_wdt_set_timeout(IMX2_WDT_MAX_TIME);
325 imx2_wdt_ping();
326
327 dev_crit(imx2_wdt_miscdev.parent,
328 "Device shutdown: Expect reboot!\n");
329 }
330}
331
332static struct platform_driver imx2_wdt_driver = {
333 .probe = imx2_wdt_probe,
334 .remove = __exit_p(imx2_wdt_remove),
335 .shutdown = imx2_wdt_shutdown,
336 .driver = {
337 .name = DRIVER_NAME,
338 .owner = THIS_MODULE,
339 },
340};
341
342static int __init imx2_wdt_init(void)
343{
344 return platform_driver_probe(&imx2_wdt_driver, imx2_wdt_probe);
345}
346module_init(imx2_wdt_init);
347
348static void __exit imx2_wdt_exit(void)
349{
350 platform_driver_unregister(&imx2_wdt_driver);
351}
352module_exit(imx2_wdt_exit);
353
354MODULE_AUTHOR("Wolfram Sang");
355MODULE_DESCRIPTION("Watchdog driver for IMX2 and later");
356MODULE_LICENSE("GPL v2");
357MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
358MODULE_ALIAS("platform:" DRIVER_NAME);
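The new driver exposes the standard miscdev watchdog interface: opening /dev/watchdog starts the hardware (which can then never be stopped, only serviced), a write containing 'V' arms the magic close, and after a clean close a kernel timer keeps pinging so an idle system does not reboot. The WCR timeout field counts half-seconds, hence WDOG_SEC_TO_COUNT(s) = (s * 2 - 1) << 8, so 60 s becomes the value 119 in bits 8-15. A hedged userspace sketch of the intended usage:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/watchdog.h>

    int main(void)
    {
            int timeout = 30;
            int i;
            int fd = open("/dev/watchdog", O_WRONLY);  /* starts the WDT */

            if (fd < 0)
                    return 1;
            ioctl(fd, WDIOC_SETTIMEOUT, &timeout);  /* driver clamps to 1..128 s */
            for (i = 0; i < 10; i++) {
                    ioctl(fd, WDIOC_KEEPALIVE, 0);  /* 0x5555/0xAAAA service ping */
                    sleep(timeout / 2);
            }
            write(fd, "V", 1);  /* magic close: driver's timer takes over pinging */
            close(fd);          /* without the 'V', close logs "Expect reboot!" */
            return 0;
    }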
diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index 7b55974191dd..6622335773bb 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -53,7 +53,7 @@ static int mpc8xxx_wdt_init_late(void);
53static u16 timeout = 0xffff; 53static u16 timeout = 0xffff;
54module_param(timeout, ushort, 0); 54module_param(timeout, ushort, 0);
55MODULE_PARM_DESC(timeout, 55MODULE_PARM_DESC(timeout,
56 "Watchdog timeout in ticks. (0<timeout<65536, default=65535"); 56 "Watchdog timeout in ticks. (0<timeout<65536, default=65535)");
57 57
58static int reset = 1; 58static int reset = 1;
59module_param(reset, bool, 0); 59module_param(reset, bool, 0);
diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c
index d3aa2f1fe61d..3a56bc360924 100644
--- a/drivers/watchdog/pc87413_wdt.c
+++ b/drivers/watchdog/pc87413_wdt.c
@@ -53,7 +53,9 @@
53#define WDTO 0x11 /* Watchdog timeout register */ 53#define WDTO 0x11 /* Watchdog timeout register */
54#define WDCFG 0x12 /* Watchdog config register */ 54#define WDCFG 0x12 /* Watchdog config register */
55 55
56static int io = 0x2E; /* Address used on Portwell Boards */ 56#define IO_DEFAULT 0x2E /* Address used on Portwell Boards */
57
58static int io = IO_DEFAULT;
57 59
58static int timeout = DEFAULT_TIMEOUT; /* timeout value */ 60static int timeout = DEFAULT_TIMEOUT; /* timeout value */
59static unsigned long timer_enabled; /* is the timer enabled? */ 61static unsigned long timer_enabled; /* is the timer enabled? */
@@ -583,12 +585,13 @@ MODULE_LICENSE("GPL");
583MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); 585MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
584 586
585module_param(io, int, 0); 587module_param(io, int, 0);
586MODULE_PARM_DESC(io, MODNAME " I/O port (default: " __MODULE_STRING(io) ")."); 588MODULE_PARM_DESC(io, MODNAME " I/O port (default: "
589 __MODULE_STRING(IO_DEFAULT) ").");
587 590
588module_param(timeout, int, 0); 591module_param(timeout, int, 0);
589MODULE_PARM_DESC(timeout, 592MODULE_PARM_DESC(timeout,
590 "Watchdog timeout in minutes (default=" 593 "Watchdog timeout in minutes (default="
591 __MODULE_STRING(timeout) ")."); 594 __MODULE_STRING(DEFAULT_TIMEOUT) ").");
592 595
593module_param(nowayout, int, 0); 596module_param(nowayout, int, 0);
594MODULE_PARM_DESC(nowayout, 597MODULE_PARM_DESC(nowayout,
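The pc87413 change fixes a subtle stringification bug: __MODULE_STRING(x) expands and then stringifies its argument at preprocessing time, so passing the variable io produced the literal text "io" in the help string rather than its value. Routing the default through a macro makes the expansion meaningful. A compact illustration with hypothetical macro names mirroring __MODULE_STRING's two-step definition:

    #define __STR(x)   #x
    #define STRING(x)  __STR(x)

    #define IO_DEFAULT 0x2E
    static int io = IO_DEFAULT;

    /* STRING(io)         -> "io"    (a variable name cannot be expanded) */
    /* STRING(IO_DEFAULT) -> "0x2E"  (macros expand before stringification) */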
diff --git a/drivers/watchdog/pnx833x_wdt.c b/drivers/watchdog/pnx833x_wdt.c
index 09102f09e681..a7b5ad2a98bd 100644
--- a/drivers/watchdog/pnx833x_wdt.c
+++ b/drivers/watchdog/pnx833x_wdt.c
@@ -33,6 +33,8 @@
33#define PFX "pnx833x: " 33#define PFX "pnx833x: "
34#define WATCHDOG_TIMEOUT 30 /* 30 sec Maximum timeout */ 34#define WATCHDOG_TIMEOUT 30 /* 30 sec Maximum timeout */
35#define WATCHDOG_COUNT_FREQUENCY 68000000U /* Watchdog counts at 68MHZ. */ 35#define WATCHDOG_COUNT_FREQUENCY 68000000U /* Watchdog counts at 68MHZ. */
36#define PNX_WATCHDOG_TIMEOUT (WATCHDOG_TIMEOUT * WATCHDOG_COUNT_FREQUENCY)
37#define PNX_TIMEOUT_VALUE 2040000000U
36 38
37/** CONFIG block */ 39/** CONFIG block */
38#define PNX833X_CONFIG (0x07000U) 40#define PNX833X_CONFIG (0x07000U)
@@ -47,20 +49,21 @@
47static int pnx833x_wdt_alive; 49static int pnx833x_wdt_alive;
48 50
49/* Set default timeout in MHZ.*/ 51/* Set default timeout in MHZ.*/
50static int pnx833x_wdt_timeout = (WATCHDOG_TIMEOUT * WATCHDOG_COUNT_FREQUENCY); 52static int pnx833x_wdt_timeout = PNX_WATCHDOG_TIMEOUT;
51module_param(pnx833x_wdt_timeout, int, 0); 53module_param(pnx833x_wdt_timeout, int, 0);
52MODULE_PARM_DESC(timeout, "Watchdog timeout in Mhz. (68Mhz clock), default=" 54MODULE_PARM_DESC(timeout, "Watchdog timeout in Mhz. (68Mhz clock), default="
53 __MODULE_STRING(pnx833x_wdt_timeout) "(30 seconds)."); 55 __MODULE_STRING(PNX_TIMEOUT_VALUE) "(30 seconds).");
54 56
55static int nowayout = WATCHDOG_NOWAYOUT; 57static int nowayout = WATCHDOG_NOWAYOUT;
56module_param(nowayout, int, 0); 58module_param(nowayout, int, 0);
57MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" 59MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
58 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); 60 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
59 61
60static int start_enabled = 1; 62#define START_DEFAULT 1
63static int start_enabled = START_DEFAULT;
61module_param(start_enabled, int, 0); 64module_param(start_enabled, int, 0);
62MODULE_PARM_DESC(start_enabled, "Watchdog is started on module insertion " 65MODULE_PARM_DESC(start_enabled, "Watchdog is started on module insertion "
63 "(default=" __MODULE_STRING(start_enabled) ")"); 66 "(default=" __MODULE_STRING(START_DEFAULT) ")");
64 67
65static void pnx833x_wdt_start(void) 68static void pnx833x_wdt_start(void)
66{ 69{
diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c
index e4cebef55177..300932580ded 100644
--- a/drivers/watchdog/s3c2410_wdt.c
+++ b/drivers/watchdog/s3c2410_wdt.c
@@ -63,7 +63,7 @@ module_param(nowayout, int, 0);
63module_param(soft_noboot, int, 0); 63module_param(soft_noboot, int, 0);
64module_param(debug, int, 0); 64module_param(debug, int, 0);
65 65
66MODULE_PARM_DESC(tmr_margin, "Watchdog tmr_margin in seconds. default=" 66MODULE_PARM_DESC(tmr_margin, "Watchdog tmr_margin in seconds. (default="
67 __MODULE_STRING(CONFIG_S3C2410_WATCHDOG_DEFAULT_TIME) ")"); 67 __MODULE_STRING(CONFIG_S3C2410_WATCHDOG_DEFAULT_TIME) ")");
68MODULE_PARM_DESC(tmr_atboot, 68MODULE_PARM_DESC(tmr_atboot,
69 "Watchdog is started at boot time if set to 1, default=" 69 "Watchdog is started at boot time if set to 1, default="
@@ -71,8 +71,8 @@ MODULE_PARM_DESC(tmr_atboot,
71MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" 71MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
72 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); 72 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
73MODULE_PARM_DESC(soft_noboot, "Watchdog action, set to 1 to ignore reboots, " 73MODULE_PARM_DESC(soft_noboot, "Watchdog action, set to 1 to ignore reboots, "
74 "0 to reboot (default depends on ONLY_TESTING)"); 74 "0 to reboot (default 0)");
75MODULE_PARM_DESC(debug, "Watchdog debug, set to >1 for debug, (default 0)"); 75MODULE_PARM_DESC(debug, "Watchdog debug, set to >1 for debug (default 0)");
76 76
77static unsigned long open_lock; 77static unsigned long open_lock;
78static struct device *wdt_dev; /* platform device attached to */ 78static struct device *wdt_dev; /* platform device attached to */
@@ -426,8 +426,7 @@ static int __devinit s3c2410wdt_probe(struct platform_device *pdev)
426 wdt_mem = request_mem_region(res->start, size, pdev->name); 426 wdt_mem = request_mem_region(res->start, size, pdev->name);
427 if (wdt_mem == NULL) { 427 if (wdt_mem == NULL) {
428 dev_err(dev, "failed to get memory region\n"); 428 dev_err(dev, "failed to get memory region\n");
429 ret = -ENOENT; 429 return -EBUSY;
430 goto err_req;
431 } 430 }
432 431
433 wdt_base = ioremap(res->start, size); 432 wdt_base = ioremap(res->start, size);
diff --git a/drivers/watchdog/shwdt.c b/drivers/watchdog/shwdt.c
index a03f84e5ee1f..6fc74065abee 100644
--- a/drivers/watchdog/shwdt.c
+++ b/drivers/watchdog/shwdt.c
@@ -496,7 +496,7 @@ MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
496module_param(clock_division_ratio, int, 0); 496module_param(clock_division_ratio, int, 0);
497MODULE_PARM_DESC(clock_division_ratio, 497MODULE_PARM_DESC(clock_division_ratio,
498 "Clock division ratio. Valid ranges are from 0x5 (1.31ms) " 498 "Clock division ratio. Valid ranges are from 0x5 (1.31ms) "
499 "to 0x7 (5.25ms). (default=" __MODULE_STRING(clock_division_ratio) ")"); 499 "to 0x7 (5.25ms). (default=" __MODULE_STRING(WTCSR_CKS_4096) ")");
500 500
501module_param(heartbeat, int, 0); 501module_param(heartbeat, int, 0);
502MODULE_PARM_DESC(heartbeat, 502MODULE_PARM_DESC(heartbeat,
diff --git a/drivers/watchdog/twl4030_wdt.c b/drivers/watchdog/twl4030_wdt.c
index dcabe77ad141..b5045ca7e61c 100644
--- a/drivers/watchdog/twl4030_wdt.c
+++ b/drivers/watchdog/twl4030_wdt.c
@@ -190,6 +190,8 @@ static int __devinit twl4030_wdt_probe(struct platform_device *pdev)
190 190
191 twl4030_wdt_dev = pdev; 191 twl4030_wdt_dev = pdev;
192 192
193 twl4030_wdt_disable(wdt);
194
193 ret = misc_register(&wdt->miscdev); 195 ret = misc_register(&wdt->miscdev);
194 if (ret) { 196 if (ret) {
195 dev_err(wdt->miscdev.parent, 197 dev_err(wdt->miscdev.parent,
diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c
index bfda2e99dd89..552a4381e78f 100644
--- a/drivers/watchdog/wdt.c
+++ b/drivers/watchdog/wdt.c
@@ -91,7 +91,7 @@ MODULE_PARM_DESC(tachometer,
91static int type = 500; 91static int type = 500;
92module_param(type, int, 0); 92module_param(type, int, 0);
93MODULE_PARM_DESC(type, 93MODULE_PARM_DESC(type,
94 "WDT501-P Card type (500 or 501 , default=500)"); 94 "WDT501-P Card type (500 or 501, default=500)");
95 95
96/* 96/*
97 * Programming support 97 * Programming support
diff --git a/drivers/watchdog/wdt977.c b/drivers/watchdog/wdt977.c
index 90ef70eb47d7..5c2521fc836c 100644
--- a/drivers/watchdog/wdt977.c
+++ b/drivers/watchdog/wdt977.c
@@ -63,7 +63,7 @@ static char expect_close;
63static DEFINE_SPINLOCK(spinlock); 63static DEFINE_SPINLOCK(spinlock);
64 64
65module_param(timeout, int, 0); 65module_param(timeout, int, 0);
66MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds (60..15300), default=" 66MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds (60..15300, default="
67 __MODULE_STRING(DEFAULT_TIMEOUT) ")"); 67 __MODULE_STRING(DEFAULT_TIMEOUT) ")");
68module_param(testmode, int, 0); 68module_param(testmode, int, 0);
69MODULE_PARM_DESC(testmode, "Watchdog testmode (1 = no reboot), default=0"); 69MODULE_PARM_DESC(testmode, "Watchdog testmode (1 = no reboot), default=0");
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 8943b8ccee1a..07e857b0de13 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -185,6 +185,7 @@ static void shutdown_handler(struct xenbus_watch *watch,
185 kfree(str); 185 kfree(str);
186} 186}
187 187
188#ifdef CONFIG_MAGIC_SYSRQ
188static void sysrq_handler(struct xenbus_watch *watch, const char **vec, 189static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
189 unsigned int len) 190 unsigned int len)
190{ 191{
@@ -214,15 +215,16 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
214 handle_sysrq(sysrq_key, NULL); 215 handle_sysrq(sysrq_key, NULL);
215} 216}
216 217
217static struct xenbus_watch shutdown_watch = {
218 .node = "control/shutdown",
219 .callback = shutdown_handler
220};
221
222static struct xenbus_watch sysrq_watch = { 218static struct xenbus_watch sysrq_watch = {
223 .node = "control/sysrq", 219 .node = "control/sysrq",
224 .callback = sysrq_handler 220 .callback = sysrq_handler
225}; 221};
222#endif
223
224static struct xenbus_watch shutdown_watch = {
225 .node = "control/shutdown",
226 .callback = shutdown_handler
227};
226 228
227static int setup_shutdown_watcher(void) 229static int setup_shutdown_watcher(void)
228{ 230{
@@ -234,11 +236,13 @@ static int setup_shutdown_watcher(void)
234 return err; 236 return err;
235 } 237 }
236 238
239#ifdef CONFIG_MAGIC_SYSRQ
237 err = register_xenbus_watch(&sysrq_watch); 240 err = register_xenbus_watch(&sysrq_watch);
238 if (err) { 241 if (err) {
239 printk(KERN_ERR "Failed to set sysrq watcher\n"); 242 printk(KERN_ERR "Failed to set sysrq watcher\n");
240 return err; 243 return err;
241 } 244 }
245#endif
242 246
243 return 0; 247 return 0;
244} 248}
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..9badbc0bfb1d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
242 * use STACK_TOP because that can depend on attributes which aren't 242 * use STACK_TOP because that can depend on attributes which aren't
243 * configured yet. 243 * configured yet.
244 */ 244 */
245 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
245 vma->vm_end = STACK_TOP_MAX; 246 vma->vm_end = STACK_TOP_MAX;
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 247 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 248 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 250 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 251 err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
616 else if (executable_stack == EXSTACK_DISABLE_X) 617 else if (executable_stack == EXSTACK_DISABLE_X)
617 vm_flags &= ~VM_EXEC; 618 vm_flags &= ~VM_EXEC;
618 vm_flags |= mm->def_flags; 619 vm_flags |= mm->def_flags;
620 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
619 621
620 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621 vm_flags); 623 vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 632 goto out_unlock;
631 } 633 }
632 634
635 /* mprotect_fixup is overkill to remove the temporary stack flags */
636 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
637
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 638 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 639 stack_size = vma->vm_end - vma->vm_start;
635 /* 640 /*
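The exec change brackets stack setup with a transient VMA flag: VM_STACK_INCOMPLETE_SETUP is set while the temporary stack is built and shifted into place, and cleared once setup_arg_pages() has finished, so concurrent walkers that understand the flag can skip the half-constructed mapping. A generic sketch of the bracketing pattern, with a hypothetical flag and structure rather than the mm code itself:

    #define OBJ_INCOMPLETE 0x1      /* hypothetical "still being built" marker */

    struct obj {
            unsigned long flags;
            /* ... fields filled in over several steps ... */
    };

    static void obj_setup(struct obj *o)
    {
            o->flags |= OBJ_INCOMPLETE;     /* publish as "do not touch yet" */

            /* ... multi-step construction that temporarily violates the
             * object's invariants, as shift_arg_pages() does ... */

            o->flags &= ~OBJ_INCOMPLETE;    /* invariants hold again */
    }

    static int obj_walker_should_skip(const struct obj *o)
    {
            return o->flags & OBJ_INCOMPLETE;  /* concurrent walkers bail out */
    }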
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
242 while (*fclus < cluster) { 242 while (*fclus < cluster) {
243 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
244 if (*fclus > limit) { 244 if (*fclus > limit) {
245 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
246 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
247 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
248 nr = -EIO; 249 nr = -EIO;
249 goto out; 250 goto out;
250 } 251 }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
253 if (nr < 0) 254 if (nr < 0)
254 goto out; 255 goto out;
255 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
256 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
257 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
258 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
259 nr = -EIO; 260 nr = -EIO;
260 goto out; 261 goto out;
261 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index eb821ee1a333..53dba57b49a1 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 326 struct inode *i2);
324/* fat/misc.c */ 327/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 328extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 335extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c611818893b2..ed33904926ee 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1250 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1251 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1252 sbi->dir_ops = fs_dir_inode_ops;
1253 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1254 DEFAULT_RATELIMIT_BURST);
1253 1255
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1256 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1257 if (error)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5c4161f1fd9a..ea8592b90696 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -409,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
409 wait_queue_head_t *wqh; 409 wait_queue_head_t *wqh;
410 410
411 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 411 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
412 do { 412 while (inode->i_state & I_SYNC) {
413 spin_unlock(&inode_lock); 413 spin_unlock(&inode_lock);
414 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 414 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
415 spin_lock(&inode_lock); 415 spin_lock(&inode_lock);
416 } while (inode->i_state & I_SYNC); 416 }
417} 417}
418 418
419/* 419/*
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 9fb76b0a0485..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype) 236 void *buffer, size_t size, int xtype)
237{ 237{
238 struct inode *inode = dentry->d_inode; 238 struct inode *inode = dentry->d_inode;
239 struct gfs2_sbd *sdp = GFS2_SB(inode);
239 struct posix_acl *acl; 240 struct posix_acl *acl;
240 int type; 241 int type;
241 int error; 242 int error;
242 243
244 if (!sdp->sd_args.ar_posix_acl)
245 return -EOPNOTSUPP;
246
243 type = gfs2_acl_type(name); 247 type = gfs2_acl_type(name);
244 if (type < 0) 248 if (type < 0)
245 return type; 249 return type;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..b20bfcc9fa2d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 51d8061fa07a..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -242,34 +242,38 @@ fail:
242} 242}
243 243
244/** 244/**
245 * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation 245 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
246 * and try to reclaim it by doing iput.
247 *
248 * This function assumes no rgrp locks are currently held.
249 *
246 * @sb: The super block 250 * @sb: The super block
247 * no_addr: The inode number 251 * no_addr: The inode number
248 * @@inode: A pointer to the inode found, if any
249 * 252 *
250 * Returns: 0 and *inode if no errors occurred. If an error occurs,
251 * the resulting *inode may or may not be NULL.
252 */ 253 */
253 254
254int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, 255void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
255 struct inode **inode)
256{ 256{
257 struct gfs2_sbd *sdp; 257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip; 258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl; 259 struct gfs2_glock *io_gl;
260 int error; 260 int error;
261 struct gfs2_holder gh; 261 struct gfs2_holder gh;
262 struct inode *inode;
262 263
263 *inode = gfs2_iget_skip(sb, no_addr); 264 inode = gfs2_iget_skip(sb, no_addr);
264 265
265 if (!(*inode)) 266 if (!inode)
266 return -ENOBUFS; 267 return;
267 268
268 if (!((*inode)->i_state & I_NEW)) 269 /* If it's not a new inode, someone's using it, so leave it alone. */
269 return -ENOBUFS; 270 if (!(inode->i_state & I_NEW)) {
271 iput(inode);
272 return;
273 }
270 274
271 ip = GFS2_I(*inode); 275 ip = GFS2_I(inode);
272 sdp = GFS2_SB(*inode); 276 sdp = GFS2_SB(inode);
273 ip->i_no_formal_ino = -1; 277 ip->i_no_formal_ino = -1;
274 278
275 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 279 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
284 set_bit(GIF_INVALID, &ip->i_flags); 288 set_bit(GIF_INVALID, &ip->i_flags);
285 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, 289 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
286 &ip->i_iopen_gh); 290 &ip->i_iopen_gh);
287 if (unlikely(error)) { 291 if (unlikely(error))
288 if (error == GLR_TRYFAILED)
289 error = 0;
290 goto fail_iopen; 292 goto fail_iopen;
291 } 293
292 ip->i_iopen_gh.gh_gl->gl_object = ip; 294 ip->i_iopen_gh.gh_gl->gl_object = ip;
293 gfs2_glock_put(io_gl); 295 gfs2_glock_put(io_gl);
294 296
295 (*inode)->i_mode = DT2IF(DT_UNKNOWN); 297 inode->i_mode = DT2IF(DT_UNKNOWN);
296 298
297 /* 299 /*
298 * We must read the inode in order to work out its type in 300 * We must read the inode in order to work out its type in
@@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
303 */ 305 */
304 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, 306 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
305 &gh); 307 &gh);
306 if (unlikely(error)) { 308 if (unlikely(error))
307 if (error == GLR_TRYFAILED)
308 error = 0;
309 goto fail_glock; 309 goto fail_glock;
310 } 310
311 /* Inode is now uptodate */ 311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh); 312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(*inode); 313 gfs2_set_iop(inode);
314
315 /* The iput will cause it to be deleted. */
316 iput(inode);
317 return;
314 318
315 return 0;
316fail_glock: 319fail_glock:
317 gfs2_glock_dq(&ip->i_iopen_gh); 320 gfs2_glock_dq(&ip->i_iopen_gh);
318fail_iopen: 321fail_iopen:
@@ -321,7 +324,8 @@ fail_put:
321 ip->i_gl->gl_object = NULL; 324 ip->i_gl->gl_object = NULL;
322 gfs2_glock_put(ip->i_gl); 325 gfs2_glock_put(ip->i_gl);
323fail: 326fail:
324 return error; 327 iget_failed(inode);
328 return;
325} 329}
326 330
327static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 331static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
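
The conversion above turns an error-returning lookup into a self-contained helper: every outcome (lookup miss, inode already in use, glock failure) is handled internally and ends in iput() or iget_failed(), so the caller has nothing to unwind. A userspace model of that fire-and-forget shape, with invented names standing in for gfs2_iget_skip(), I_NEW, iput() and iget_failed():

    #include <stdbool.h>

    struct node { bool is_new; };

    struct node *lookup(unsigned long long addr);   /* NULL on miss */
    int initialise(struct node *n);                 /* may fail */
    void put(struct node *n);                       /* drop a reference */
    void put_failed(struct node *n);                /* drop ref, wake waiters */

    static void process_unlinked(unsigned long long addr)
    {
            struct node *n = lookup(addr);

            if (!n)
                    return;
            if (!n->is_new) {
                    put(n);                 /* someone's using it: leave it */
                    return;
            }
            if (initialise(n) < 0) {
                    put_failed(n);          /* mirrors the iget_failed() path */
                    return;
            }
            put(n);                         /* final reference: deletion runs */
    }
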
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index e161461d4c57..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino); 86 u64 no_addr, u64 no_formal_ino);
87extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88 struct inode **inode);
89extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
90 89
91extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b593f0e28f25..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
696 * 696 *
697 */ 697 */
698 698
699void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
700{ 700{
701 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
702 702
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eb570b4ad443..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_incr_head(struct gfs2_sbd *sdp); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55 55
56struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
57struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
58 struct buffer_head *real); 58 struct buffer_head *real);
59void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
60 62
61static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
62{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
63 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
64 __gfs2_log_flush(sbd, gl);
65}
66
67void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
68void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
69
70void gfs2_log_shutdown(struct gfs2_sbd *sdp);
71void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
72int gfs2_logd(void *data);
73 66
74#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
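
The deleted static inline gfs2_log_flush() was a fast-path filter: a cheap GLF_LFLUSH bit test in the header guarding the out-of-line flush, now folded away so callers hit the function directly. The general idiom, sketched with invented names:

    struct log;
    struct ctx { unsigned long flags; };

    #define NEEDS_FLUSH (1UL << 0)          /* invented bit */

    void __do_flush(struct log *lg, struct ctx *c); /* heavyweight, out of line */

    static inline void maybe_flush(struct log *lg, struct ctx *c)
    {
            /* cheap header-level test guards the expensive call */
            if (!c || (c->flags & NEEDS_FLUSH))
                    __do_flush(lg, c);
    }
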
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 117fa4171f62..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1192,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1192{ 1192{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1195 struct inode *inode;
1196 int error = 0; 1195 int error = 0;
1197 u64 last_unlinked = NO_BLOCK, unlinked; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1198 1197
@@ -1210,22 +1209,27 @@ try_again:
1210 if (error) 1209 if (error)
1211 return error; 1210 return error;
1212 1211
1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contain its block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1213 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1214 if (error) { 1217 if (error) {
1215 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1216 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1217 if (error != -EAGAIN) 1220 if (error != -EAGAIN)
1218 return error; 1221 return error;
1219 error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, 1222
1220 unlinked, &inode); 1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1221 if (inode) 1224 /* Regardless of whether or not gfs2_process_unlinked_inode
1222 iput(inode); 1225 was successful, we don't want to repeat it. */
1226 last_unlinked = unlinked;
1223 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1224 if (error == GLR_TRYFAILED) 1228 error = 0;
1225 error = 0; 1229
1226 goto try_again; 1230 goto try_again;
1227 } 1231 }
1228 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1229 al->al_file = file; 1233 al->al_file = file;
1230 al->al_line = line; 1234 al->al_line = line;
1231 1235
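
The rgrp.c loop is the consumer of the new helper. Reduced to a sketch with hypothetical stand-ins for get_local_rgrp() and friends: on -EAGAIN the searcher reports an unlinked dinode's block address, we attempt a reclaim, record the address so the same block is never retried, flush the log, and go around again.

    #include <errno.h>

    #define NO_BLOCK 0ULL

    /* hypothetical stand-ins for get_local_rgrp() and friends */
    int get_rgrp(unsigned long long *unlinked, unsigned long long skip);
    void try_reclaim(unsigned long long blk);       /* best effort */
    void flush_log(void);

    static int reserve(void)
    {
            unsigned long long last_unlinked = NO_BLOCK, unlinked;
            int error;

            for (;;) {
                    error = get_rgrp(&unlinked, last_unlinked);
                    if (error != -EAGAIN)
                            return error;           /* 0 = rgrp reserved */
                    try_reclaim(unlinked);          /* may or may not free it */
                    last_unlinked = unlinked;       /* never retry this block */
                    flush_log();
            }
    }
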
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
1060 goto out_nomem; 1060 goto out_nomem;
1061 rc = strict_strtoul(string, 10, &option); 1061 rc = strict_strtoul(string, 10, &option);
1062 kfree(string); 1062 kfree(string);
1063 if (rc != 0 || option > USHORT_MAX) 1063 if (rc != 0 || option > USHRT_MAX)
1064 goto out_invalid_value; 1064 goto out_invalid_value;
1065 mnt->nfs_server.port = option; 1065 mnt->nfs_server.port = option;
1066 break; 1066 break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
1181 goto out_nomem; 1181 goto out_nomem;
1182 rc = strict_strtoul(string, 10, &option); 1182 rc = strict_strtoul(string, 10, &option);
1183 kfree(string); 1183 kfree(string);
1184 if (rc != 0 || option > USHORT_MAX) 1184 if (rc != 0 || option > USHRT_MAX)
1185 goto out_invalid_value; 1185 goto out_invalid_value;
1186 mnt->mount_server.port = option; 1186 mnt->mount_server.port = option;
1187 break; 1187 break;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
998 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 998 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
999 return -EINVAL; 999 return -EINVAL;
1000 1000
1001 if (port < 1 || port > USHORT_MAX) 1001 if (port < 1 || port > USHRT_MAX)
1002 return -EINVAL; 1002 return -EINVAL;
1003 1003
1004 err = nfsd_create_serv(); 1004 err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1041 return -EINVAL; 1041 return -EINVAL;
1042 1042
1043 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1043 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1044 return -EINVAL; 1044 return -EINVAL;
1045 1045
1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..a1924a0d2ab0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
100 * 100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
111 * held by the caller. 108 * held by the caller.
112 */ 109 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to the LRU list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
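
The ntfs changes ride on add_to_page_cache_lru(), which inserts a page into both the page cache and the LRU in one call, so the caller-side pagevec batching, the extra page reference, and the final drain all disappear. Abridged from the hunks above, old shape versus new:

    /* old: separate cache insert plus hand-rolled LRU batching */
    err = add_to_page_cache(page, mapping, index, GFP_KERNEL);
    if (!err) {
            page_cache_get(page);
            if (!pagevec_add(&lru_pvec, page))
                    __pagevec_lru_add_file(&lru_pvec);
    }

    /* new: one call, LRU placement handled inside */
    err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
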
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 3ceca05b668c..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
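
hex_to_bin() (reached via the new linux/kernel.h include) returns the nibble value or a negative number on bad input, which collapses the three hand-rolled range checks per nibble. A portable rendering of the cleaned-up parser, with a local stand-in for hex_to_bin():

    static int hex_to_bin_compat(char c)
    {
            if (c >= '0' && c <= '9') return c - '0';
            if (c >= 'a' && c <= 'f') return c - 'a' + 10;
            if (c >= 'A' && c <= 'F') return c - 'A' + 10;
            return -1;
    }

    static int parse_hexbyte(const unsigned char *src)
    {
            int hi = hex_to_bin_compat(src[0]);
            int lo = hex_to_bin_compat(src[1]);

            if (hi < 0 || lo < 0)
                    return -1;
            return (hi << 4) | lo;  /* same result as (x << 4) + h above */
    }
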
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
634 return err; 634 return err;
635} 635}
636 636
637#ifdef CONFIG_HUGETLB_PAGE
637static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
638{ 639{
639 u64 pme = 0; 640 u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
664 665
665 return err; 666 return err;
666} 667}
668#endif /* HUGETLB_PAGE */
667 669
668/* 670/*
669 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
733 735
734 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
735 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
736 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
737 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
738 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
739 743
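
This is a build fix: the hugetlb helpers reference hugetlb-only definitions, so both the helper functions and the point where the callback is wired up must sit under the same CONFIG_HUGETLB_PAGE guard. One common shape of the idiom, sketched with invented names (whether the struct field itself is guarded depends on the header):

    struct walker {
            int (*pmd_entry)(unsigned long addr);
    #ifdef CONFIG_HUGETLB_PAGE
            int (*hugetlb_entry)(unsigned long addr);
    #endif
    };

    int handle_pmd(unsigned long addr);
    #ifdef CONFIG_HUGETLB_PAGE
    int handle_huge(unsigned long addr);
    #endif

    static void setup_walk(struct walker *w)
    {
            w->pmd_entry = handle_pmd;
    #ifdef CONFIG_HUGETLB_PAGE
            w->hugetlb_entry = handle_huge; /* compiled out when unset */
    #endif
    }
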
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c43..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
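
Condensed, the new fast path in xfs_buf_lock() reads as below (abridged from the hunk; only identifiers that appear above are used). A stale, pinned buffer can only be held locked by an uncommitted log item, so pushing the log asynchronously ourselves beats sleeping until another thread does it:

    void xfs_buf_lock(xfs_buf_t *bp)        /* abridged */
    {
            /* stale + pinned: only an uncommitted log item holds the lock */
            if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
                    xfs_log_force(bp->b_mount, 0);  /* async push */
            if (atomic_read(&bp->b_io_remaining))
                    blk_run_address_space(bp->b_target->bt_mapping);
            down(&bp->b_sema);
    }
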
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d3..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f24dbe5efde3..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
@@ -1755,7 +1765,7 @@ xfs_init_zones(void)
1755 * but it is much faster. 1765 * but it is much faster.
1756 */ 1766 */
1757 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1758 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1759 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1760 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1761 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
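
delaylog/nodelaylog follows the usual positive/negative mount-option pair toggling a single flag bit; XFS_MOUNT_DELAYLOG is assumed to be defined alongside the other mount flags. A minimal sketch of that pattern with invented names:

    #include <string.h>

    #define MOUNT_DELAYLOG (1u << 0)        /* invented bit value */

    static void parse_opt(const char *opt, unsigned int *flags)
    {
            if (!strcmp(opt, "delaylog"))
                    *flags |= MOUNT_DELAYLOG;
            else if (!strcmp(opt, "nodelaylog"))
                    *flags &= ~MOUNT_DELAYLOG;
    }
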
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd2901..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1059 1059
1060); 1060);
1061 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1062TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1063 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1064 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1065 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1066 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1067 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1068 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1069 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1070 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1071 __field(int, slot) 1077 __field(int, sync)
1072 ), 1078 ),
1073 TP_fast_assign( 1079 TP_fast_assign(
1074 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1075 __entry->agno = agno; 1083 __entry->agno = agno;
1076 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1077 __entry->len = len; 1085 __entry->len = len;
1078 __entry->slot = slot; 1086 __entry->sync = sync;
1079 ), 1087 ),
1080 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1081 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1082 __entry->agno, 1092 __entry->agno,
1083 __entry->agbno, 1093 __entry->agbno,
1084 __entry->len, 1094 __entry->len,
1085 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1086 1096
1087); 1097);
1088 1098
1089#define XFS_BUSY_STATES \
1090 { 0, "found" }, \
1091 { 1, "missing" }
1092
1093TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1094 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1095 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1096 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1097 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1098 __field(dev_t, dev) 1104 __field(dev_t, dev)
1099 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1100 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1101 __field(int, found) 1107 __field(xfs_extlen_t, len)
1102 ), 1108 ),
1103 TP_fast_assign( 1109 TP_fast_assign(
1104 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1105 __entry->agno = agno; 1111 __entry->agno = agno;
1106 __entry->slot = slot; 1112 __entry->agbno = agbno;
1107 __entry->found = found; 1113 __entry->len = len;
1108 ), 1114 ),
1109 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1110 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1111 __entry->agno, 1117 __entry->agno,
1112 __entry->slot, 1118 __entry->agbno,
1113 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1114); 1120);
1115 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1116TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1117 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1118 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1119 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1120 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1121 __field(dev_t, dev) 1131 __field(dev_t, dev)
1122 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1123 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1124 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1125 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1126 ), 1136 ),
1127 TP_fast_assign( 1137 TP_fast_assign(
1128 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1129 __entry->agno = agno; 1139 __entry->agno = agno;
1130 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1131 __entry->len = len; 1141 __entry->len = len;
1132 __entry->lsn = lsn; 1142 __entry->found = found;
1133 ), 1143 ),
1134 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1135 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1136 __entry->agno, 1146 __entry->agno,
1137 __entry->agbno, 1147 __entry->agbno,
1138 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1139 __entry->lsn) 1168 __entry->lsn)
1140); 1169);
1141 1170
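
__print_symbolic() maps an integer trace field to a label from a { value, "string" } table at trace output time, which is why the sync and found fields above now print as words rather than raw numbers. A plain C equivalent of the lookup, runnable as-is:

    #include <stdio.h>

    struct sym { int val; const char *name; };

    static const char *print_symbolic(int v, const struct sym *tbl, int n)
    {
            for (int i = 0; i < n; i++)
                    if (tbl[i].val == v)
                            return tbl[i].name;
            return "unknown";
    }

    int main(void)
    {
            static const struct sym busy_states[] = {
                    { 0, "missing" }, { 1, "found" },
            };

            printf("%s\n", print_symbolic(1, busy_states, 2)); /* found */
            return 0;
    }
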
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df0129..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
345 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
346 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
347 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
348 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
349 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
351} 351}
352 352
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by an rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
226 int pag_ici_reclaimable; /* reclaimable inodes */ 233 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 234#endif
228 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 236} xfs_perag_t;
231 237
232/* 238/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * there can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extent free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID for identifying the transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different. i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, then mark the same
2568 * extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to ensure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial or exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
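
The data-structure change is the heart of this patch: the fixed pagb_list[XFS_PAGB_NUM_SLOTS] array becomes an rbtree keyed by start block, and both insert and search walk it testing for range overlap rather than key equality. The overlap test during descent is the subtle part; here it is on a plain binary tree, compilable standalone (the kernel walk over rb_node under pagb_lock is identical in shape):

    struct busy {
            struct busy *left, *right;      /* BST stand-in for rb_node */
            unsigned long bno;              /* start block: the tree key */
            unsigned long len;
    };

    /* 0: no overlap, -1: partial overlap, 1: exact (bno and len) match */
    static int busy_search(const struct busy *n, unsigned long bno,
                           unsigned long len)
    {
            int match = 0;

            while (n) {
                    if (bno < n->bno) {
                            if (bno + len > n->bno)
                                    match = -1;     /* overlaps from below */
                            n = n->left;
                    } else if (bno > n->bno) {
                            if (bno < n->bno + n->len)
                                    match = -1;     /* overlaps from above */
                            n = n->right;
                    } else {
                            return n->len == len ? 1 : -1;
                    }
            }
            return match;
    }

On overlap the two kernel paths then diverge: the search path reports it so the allocating transaction can be made synchronous, while the insert path merges same-tid extents and forces the log to flush out extents owned by other transactions.
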
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727b..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not: 334/* You would think we need to bump the nvecs here too, but we do not:
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
489} 509}
490 510
491/* 511/*
492 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
493 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
494 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
495 * buf log item and remove the reference to it in the 515 *
496 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
497 * 518 *
498 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
499 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
500 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
501 * 524 *
502 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
503 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
504 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
505 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
506 */ 529 */
507STATIC void 530STATIC void
508xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
514 537
515 bp = bip->bli_buf; 538 bp = bip->bli_buf;
516 539
517 /* 540 /* Clear the buffer's association with this transaction. */
518 * Clear the buffer's association with this transaction.
519 */
520 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
521 542
522 /* 543 /*
523 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
524 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
525 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
526 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
527 * cycle if we abort inside commit.
528 */ 548 */
529 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
530 550
531 /* 551 /*
532 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
533 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
534 * buffer is unpinned for the last time.
535 */ 554 */
536 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
537 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
538 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
539 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
540 if (!aborted)
541 return;
542 }
543 559
544 /* 560 /*
545 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
546 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
547 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
548 * the transaction is really through with the buffer.
549 */ 564 */
550 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
551 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
552 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
553 /* 568 if (!aborted) {
554 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
555 * transaction state. 570 return;
556 */ 571 }
557 bip->bli_flags &= ~XFS_BLI_LOGGED;
558 } 572 }
559 573
560 /*
561 * Before possibly freeing the buf item, determine if we should
562 * release the buffer at the end of this routine.
563 */
564 hold = bip->bli_flags & XFS_BLI_HOLD;
565 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
566 575
567 /* 576 /*
568 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
569 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
570 */ 579 */
571 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
572 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
573 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
574 } else if (hold) { 583 else
575 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
576 }
577 585
578 /* 586 if (!hold)
579 * Release the buffer if XFS_BLI_HOLD was not set.
580 */
581 if (!hold) {
582 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
583 }
584} 588}
585 589
586/* 590/*
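
Taken together, the pin and unlock hunks above give the buf log item a simple
reference-counting life cycle: the transaction holds one reference, and pinning
takes a second, so unlock can always drop the transaction's reference without
checking whether the item was logged. A minimal userspace sketch of that life
cycle (a toy model with made-up names, not the kernel code):

#include <assert.h>

/* Toy model: one reference per transaction, one more per pin. */
struct bli_model { int refcount; int pin_count; };

static void model_pin(struct bli_model *b)
{
	b->refcount++;		/* bli held while the item is pinned */
	b->pin_count++;
}

static void model_unlock(struct bli_model *b)
{
	b->refcount--;		/* transaction ref, dropped unconditionally */
}

static void model_unpin(struct bli_model *b)
{
	b->pin_count--;
	b->refcount--;		/* pin ref; zero means the bli can be freed */
}

int main(void)
{
	struct bli_model b = { .refcount = 1, .pin_count = 0 };

	model_pin(&b);			/* commit pins the logged item */
	model_unlock(&b);		/* transaction completes */
	assert(b.refcount == 1);	/* pin ref keeps the bli alive */
	model_unpin(&b);		/* log IO completes */
	assert(b.refcount == 0);
	return 0;
}
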
@@ -717,12 +721,12 @@ xfs_buf_item_init(
717 } 721 }
718 722
719 /* 723 /*
720 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
721 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
722 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
723 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
724 */ 728 */
725 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
726 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
727 731
728 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
790 /* 794 /*
791 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
792 */ 796 */
793 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
794 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
795 799
796 /* 800 /*
797 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f73..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
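
The XFS_BLI_* to XFS_BLF_* rename does not change any values, so the byte
offset to bitmap mapping used throughout xfs_buf_item.c stays the same: a byte
offset selects a 128-byte chunk, and each chunk maps to a word and bit in
blf_data_map. A quick standalone check of that arithmetic (a hypothetical
userspace helper, not part of the patch):

#include <assert.h>

#define NBBY			8
#define XFS_BLF_CHUNK		128
#define XFS_BLF_SHIFT		7	/* log2(XFS_BLF_CHUNK) */
#define BIT_TO_WORD_SHIFT	5	/* log2(32 bits per word) */
#define NBWORD			(NBBY * sizeof(unsigned int))

int main(void)
{
	unsigned int byte = 1000;				/* offset into the buffer */
	unsigned int chunk_num = byte >> XFS_BLF_SHIFT;		/* 1000 / 128 = 7 */
	unsigned int word_num = chunk_num >> BIT_TO_WORD_SHIFT;	/* 7 / 32 = 0 */
	unsigned int bit_num = chunk_num & (NBWORD - 1);	/* 7 % 32 = 7 */

	/* chunk 7 is tracked by bit 7 of blf_data_map[0] */
	assert(chunk_num == 7 && word_num == 0 && bit_num == 7);
	return 0;
}
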
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c0744..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72a..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
56STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
59 xlog_in_core_t **commit_iclog, uint flags);
60 57
61/* local state machine functions */ 58/* local state machine functions */
62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
86STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
87 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
88 85
89
90/* local ticket functions */
91STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
92 int unit_bytes,
93 int count,
94 char clientid,
95 uint flags);
96
97#if defined(DEBUG) 86#if defined(DEBUG)
98STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
99STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
360 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
361 internal_ticket = *ticket; 350 internal_ticket = *ticket;
362 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
363 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
364 362
365 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
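
The effect of the TID bump is easiest to see with concrete values: a
permanent-reservation ticket that is reused across a chain of rolled
transactions now stamps each roll with a new TID. A toy model (hypothetical
names and values, not the kernel code):

/* Each re-reservation on a permanent ticket bumps the TID by one. */
struct model_ticket { unsigned int t_tid; };

static void model_rereserve(struct model_ticket *tic)
{
	tic->t_tid++;	/* e.g. 0x51 -> 0x52 -> 0x53 across a rolling chain */
}
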
@@ -367,7 +365,8 @@ xfs_log_reserve(
367 } else { 365 } else {
368 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
369 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
370 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
371 if (!internal_ticket) 370 if (!internal_ticket)
372 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
373 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
452 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
453 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
454 453
454 /*
455 * Now the log has been fully initialised and we know where our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
455 return 0; 461 return 0;
456 462
457out_destroy_ail: 463out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
658 item->li_ailp = mp->m_ail; 664 item->li_ailp = mp->m_ail;
659 item->li_type = type; 665 item->li_type = type;
660 item->li_ops = ops; 666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
661} 671}
662 672
663/* 673/*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1168 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1169 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1170 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1171 return log; 1184 return log;
1172 1185
1173out_free_iclog: 1186out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1494 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1495 int i; 1508 int i;
1496 1509
1510 xlog_cil_destroy(log);
1511
1497 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1498 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1499 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1536 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1537 * the reservation 1552 * the reservation
1538 */ 1553 */
1539STATIC void 1554void
1540xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1541{ 1558{
1542 uint i; 1559 uint i;
1543 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1637 "bad-rtype" : res_type_str[r_type-1]), 1654 "bad-rtype" : res_type_str[r_type-1]),
1638 ticket->t_res_arr[i].r_len); 1655 ticket->t_res_arr[i].r_len);
1639 } 1656 }
1657
1658 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1659 "xfs_log_write: reservation ran out. Need to up reservation");
1660 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1640} 1661}
1641 1662
1642/* 1663/*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
1865 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1866 * bytes have been written out. 1887 * bytes have been written out.
1867 */ 1888 */
1868STATIC int 1889int
1869xlog_write( 1890xlog_write(
1870 struct log *log, 1891 struct log *log,
1871 struct xfs_log_vec *log_vector, 1892 struct xfs_log_vec *log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
1889 *start_lsn = 0; 1910 *start_lsn = 0;
1890 1911
1891 len = xlog_write_calc_vec_length(ticket, log_vector); 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1892 if (ticket->t_curr_res < len) { 1913 if (log->l_cilp) {
1893 xlog_print_tic_res(log->l_mp, ticket); 1914 /*
1894#ifdef DEBUG 1915 * Region headers and bytes are already accounted for.
1895 xlog_panic( 1916 * We only need to take into account start records and
1896 "xfs_log_write: reservation ran out. Need to up reservation"); 1917 * split regions in this function.
1897#else 1918 */
1898 /* Customer configurable panic */ 1919 if (ticket->t_flags & XLOG_TIC_INITED)
1899 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, 1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1900 "xfs_log_write: reservation ran out. Need to up reservation");
1901 1921
1902 /* If we did not panic, shutdown the filesystem */ 1922 /*
1903 xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); 1923 * Commit record headers need to be accounted for. These
1904#endif 1924 * come in as separate writes so are easy to detect.
1905 } 1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1906 1930
1907 ticket->t_curr_res -= len; 1931 if (ticket->t_curr_res < 0)
1932 xlog_print_tic_res(log->l_mp, ticket);
1908 1933
1909 index = 0; 1934 index = 0;
1910 lv = log_vector; 1935 lv = log_vector;
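
The reworked accounting is subtle: with delayed logging the region payload was
already debited from the ticket at commit time, so xlog_write() only charges
for the op headers it adds itself. A rough model of the two paths (not kernel
code; the 12-byte header size is an assumption for illustration):

/* Rough model of how xlog_write() now debits a ticket. */
static int debit_ticket(int curr_res, int have_cil, int start_rec,
			int commit_rec, int vec_len)
{
	const int ophdr = 12;		/* assumed sizeof(xlog_op_header_t) */

	if (have_cil) {
		if (start_rec)		/* ticket still XLOG_TIC_INITED */
			curr_res -= ophdr;
		if (commit_rec)		/* separate commit/unmount record */
			curr_res -= ophdr;
	} else {
		curr_res -= vec_len;	/* classic path charges everything */
	}
	return curr_res;		/* < 0 triggers xlog_print_tic_res() */
}
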
@@ -3000,6 +3025,8 @@ _xfs_log_force(
3000 3025
3001 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
3002 3027
3028 xlog_cil_push(log, 1);
3029
3003 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
3004 3031
3005 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
3149 3176
3150 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
3151 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
3152try_again: 3185try_again:
3153 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
3154 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
3313 return ticket; 3346 return ticket;
3314} 3347}
3315 3348
3349xlog_tid_t
3350xfs_log_get_trans_ident(
3351 struct xfs_trans *tp)
3352{
3353 return tp->t_ticket->t_tid;
3354}
3355
3316/* 3356/*
3317 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3318 */ 3358 */
3319STATIC xlog_ticket_t * 3359xlog_ticket_t *
3320xlog_ticket_alloc( 3360xlog_ticket_alloc(
3321 struct log *log, 3361 struct log *log,
3322 int unit_bytes, 3362 int unit_bytes,
3323 int cnt, 3363 int cnt,
3324 char client, 3364 char client,
3325 uint xflags) 3365 uint xflags,
3366 int alloc_flags)
3326{ 3367{
3327 struct xlog_ticket *tic; 3368 struct xlog_ticket *tic;
3328 uint num_headers; 3369 uint num_headers;
3329 int iclog_space; 3370 int iclog_space;
3330 3371
3331 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3372 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3332 if (!tic) 3373 if (!tic)
3333 return NULL; 3374 return NULL;
3334 3375
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
3647 * c. nothing new gets queued up after (a) and (b) are done. 3688 * c. nothing new gets queued up after (a) and (b) are done.
3648 * d. if !logerror, flush the iclogs to disk, then seal them off 3689 * d. if !logerror, flush the iclogs to disk, then seal them off
3649 * for business. 3690 * for business.
3691 *
3692 * Note: for delayed logging the !logerror case needs to flush the regions
3693 * held in memory out to the iclogs before flushing them to disk. This needs
3694 * to be done before the log is marked as shutdown, otherwise the flush to the
3695 * iclogs will fail.
3650 */ 3696 */
3651int 3697int
3652xfs_log_force_umount( 3698xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
3680 return 1; 3726 return 1;
3681 } 3727 }
3682 retval = 0; 3728 retval = 0;
3729
3730 /*
3731 * Flush the in memory commit item list before marking the log as
3732 * being shut down. We need to do it in this order to ensure all the
3733 * completed transactions are flushed to disk with the xfs_log_force()
3734 * call below.
3735 */
3736 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3737 xlog_cil_push(log, 1);
3738
3683 /* 3739 /*
3684 * We must hold both the GRANT lock and the LOG lock, 3740 * We must hold both the GRANT lock and the LOG lock,
3685 * before we mark the filesystem SHUTDOWN and wake 3741 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9a..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -114,6 +113,9 @@ struct xfs_log_vec {
114 struct xfs_log_vec *lv_next; /* next lv in build list */ 113 struct xfs_log_vec *lv_next; /* next lv in build list */
115 int lv_niovecs; /* number of iovecs in lv */ 114 int lv_niovecs; /* number of iovecs in lv */
116 struct xfs_log_iovec *lv_iovecp; /* iovec array */ 115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
117}; 119};
118 120
119/* 121/*
@@ -134,6 +136,7 @@ struct xlog_in_core;
134struct xlog_ticket; 136struct xlog_ticket;
135struct xfs_log_item; 137struct xfs_log_item;
136struct xfs_item_ops; 138struct xfs_item_ops;
139struct xfs_trans;
137 140
138void xfs_log_item_init(struct xfs_mount *mp, 141void xfs_log_item_init(struct xfs_mount *mp,
139 struct xfs_log_item *item, 142 struct xfs_log_item *item,
@@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
187 190
188void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
189 192
190struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
191void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
192 195
196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
202
193#endif 203#endif
194 204
195 205
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure that log->l_cilp is null so
38 * we can test it to determine whether we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we added to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket,
164 struct xfs_log_item *item,
165 struct xfs_log_vec *lv)
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item, space used is a delta */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t);
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil);
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
212 * Now transfer enough transaction reservation to the context ticket
213 * for the checkpoint. The context ticket is special - the unit
214 * reservation has to grow as well as the current reservation as we
215 * steal from tickets so we can correctly determine the space used
216 * during the transaction commit.
217 */
218 if (ctx->ticket->t_curr_res == 0) {
219 /* first commit in checkpoint, steal the header reservation */
220 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
221 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
222 ticket->t_curr_res -= ctx->ticket->t_unit_res;
223 }
224
225 /* do we need space for more log record headers? */
226 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
227 if (len > 0 && (ctx->space_used / iclog_space !=
228 (ctx->space_used + len) / iclog_space)) {
229 int hdrs;
230
231 hdrs = (len + iclog_space - 1) / iclog_space;
232 /* need to take into account split region headers, too */
233 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
234 ctx->ticket->t_unit_res += hdrs;
235 ctx->ticket->t_curr_res += hdrs;
236 ticket->t_curr_res -= hdrs;
237 ASSERT(ticket->t_curr_res >= len);
238 }
239 ticket->t_curr_res -= len;
240 ctx->space_used += len;
241
242 spin_unlock(&cil->xc_cil_lock);
243}
244
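
The reservation stealing above reads most clearly as plain arithmetic: the
first commit of a checkpoint donates the checkpoint ticket's unit reservation,
and every commit donates the bytes its formatted vectors consume. A simplified
model (not the kernel code; the extra record-header stealing for commits that
push space_used across an iclog boundary is omitted):

struct model_ticket { int curr_res; int unit_res; };

static void model_cil_steal(struct model_ticket *txn, struct model_ticket *ctx,
			    int len, int *space_used)
{
	if (ctx->curr_res == 0) {
		/* first commit in this checkpoint: steal the header overhead */
		ctx->curr_res = ctx->unit_res;
		txn->curr_res -= ctx->unit_res;
	}

	txn->curr_res -= len;	/* bytes this commit adds to the checkpoint */
	*space_used += len;
}
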
245/*
246 * Format log items into flat buffers
247 *
248 * For delayed logging, we need to hold a formatted buffer containing all the
249 * changes on the log item. This enables us to relog the item in memory and
250 * write it out asynchronously without needing to relock the object that was
251 * modified at the time it gets written into the iclog.
252 *
253 * This function builds a vector for the changes in each log item in the
254 * transaction. It then works out the length of the buffer needed for each log
255 * item, allocates them and formats the vector for the item into the buffer.
256 * The buffer is then attached to the log item, and the log items are then
257 * inserted into the Committed Item List for tracking until the next checkpoint is written out.
258 *
259 * We don't set up region headers during this process; we simply copy the
260 * regions into the flat buffer. We can do this because we still have to do a
261 * formatting step to write the regions into the iclog buffer. Writing the
262 * ophdrs during the iclog write means that we can support splitting large
263 * regions across iclog boundaries without needing a change in the format of the
264 * item/region encapsulation.
265 *
266 * Hence what we need to do now is rewrite the vector array to point
267 * to the copied regions inside the buffer we just allocated. This allows us to
268 * format the regions into the iclog as though they are being formatted
269 * directly out of the objects themselves.
270 */
271static void
272xlog_cil_format_items(
273 struct log *log,
274 struct xfs_log_vec *log_vector,
275 struct xlog_ticket *ticket,
276 xfs_lsn_t *start_lsn)
277{
278 struct xfs_log_vec *lv;
279
280 if (start_lsn)
281 *start_lsn = log->l_cilp->xc_ctx->sequence;
282
283 ASSERT(log_vector);
284 for (lv = log_vector; lv; lv = lv->lv_next) {
285 void *ptr;
286 int index;
287 int len = 0;
288
289 /* build the vector array and calculate its length */
290 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
291 for (index = 0; index < lv->lv_niovecs; index++)
292 len += lv->lv_iovecp[index].i_len;
293
294 lv->lv_buf_len = len;
295 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
296 ptr = lv->lv_buf;
297
298 for (index = 0; index < lv->lv_niovecs; index++) {
299 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
300
301 memcpy(ptr, vec->i_addr, vec->i_len);
302 vec->i_addr = ptr;
303 ptr += vec->i_len;
304 }
305 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
306
307 xlog_cil_insert(log, ticket, lv->lv_item, lv);
308 }
309}
310
311static void
312xlog_cil_free_logvec(
313 struct xfs_log_vec *log_vector)
314{
315 struct xfs_log_vec *lv;
316
317 for (lv = log_vector; lv; ) {
318 struct xfs_log_vec *next = lv->lv_next;
319 kmem_free(lv->lv_buf);
320 kmem_free(lv);
321 lv = next;
322 }
323}
324
325/*
326 * Commit a transaction with the given vector to the Committed Item List.
327 *
328 * To do this, we need to format the item, pin it in memory if required and
329 * account for the space used by the transaction. Once we have done that we
330 * need to release the unused reservation for the transaction, attach the
331 * transaction to the checkpoint context so we carry the busy extents through
332 * to checkpoint completion, and then unlock all the items in the transaction.
333 *
334 * For more specific information about the order of operations in
335 * xfs_log_commit_cil() please refer to the comments in
336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
341 */
342int
343xfs_log_commit_cil(
344 struct xfs_mount *mp,
345 struct xfs_trans *tp,
346 struct xfs_log_vec *log_vector,
347 xfs_lsn_t *commit_lsn,
348 int flags)
349{
350 struct log *log = mp->m_log;
351 int log_flags = 0;
352 int push = 0;
353
354 if (flags & XFS_TRANS_RELEASE_LOG_RES)
355 log_flags = XFS_LOG_REL_PERM_RESERV;
356
357 if (XLOG_FORCED_SHUTDOWN(log)) {
358 xlog_cil_free_logvec(log_vector);
359 return XFS_ERROR(EIO);
360 }
361
362 /* lock out background commit */
363 down_read(&log->l_cilp->xc_ctx_lock);
364 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
365
366 /* check we didn't blow the reservation */
367 if (tp->t_ticket->t_curr_res < 0)
368 xlog_print_tic_res(log->l_mp, tp->t_ticket);
369
370 /* attach the transaction to the CIL if it has any busy extents */
371 if (!list_empty(&tp->t_busy)) {
372 spin_lock(&log->l_cilp->xc_cil_lock);
373 list_splice_init(&tp->t_busy,
374 &log->l_cilp->xc_ctx->busy_extents);
375 spin_unlock(&log->l_cilp->xc_cil_lock);
376 }
377
378 tp->t_commit_lsn = *commit_lsn;
379 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
380 xfs_trans_unreserve_and_mod_sb(tp);
381
382 /* check for background commit before unlock */
383 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
384 push = 1;
385 up_read(&log->l_cilp->xc_ctx_lock);
386
387 /*
388 * We need to push the CIL every so often so we don't cache more than we
389 * can fit in the log. The limit really is that a checkpoint can't be
390 * more than half the log (the current checkpoint is not allowed to
391 * overwrite the previous checkpoint), but commit latency and memory
392 * usage limit this to a smaller size in most cases.
393 */
394 if (push)
395 xlog_cil_push(log, 0);
396 return 0;
397}
398
399/*
400 * Mark all items committed and clear busy extents. We free the log vector
401 * chains in a separate pass so that we unpin the log items as quickly as
402 * possible.
403 */
404static void
405xlog_cil_committed(
406 void *args,
407 int abort)
408{
409 struct xfs_cil_ctx *ctx = args;
410 struct xfs_log_vec *lv;
411 int abortflag = abort ? XFS_LI_ABORTED : 0;
412 struct xfs_busy_extent *busyp, *n;
413
414 /* unpin all the log items */
415 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
416 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
417 abortflag);
418 }
419
420 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
421 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
422
423 spin_lock(&ctx->cil->xc_cil_lock);
424 list_del(&ctx->committing);
425 spin_unlock(&ctx->cil->xc_cil_lock);
426
427 xlog_cil_free_logvec(ctx->lv_chain);
428 kmem_free(ctx);
429}
430
431/*
432 * Push the Committed Item List to the log. If the push_now flag is not set,
433 * then it is a background flush and so we can choose to ignore it.
434 */
435int
436xlog_cil_push(
437 struct log *log,
438 int push_now)
439{
440 struct xfs_cil *cil = log->l_cilp;
441 struct xfs_log_vec *lv;
442 struct xfs_cil_ctx *ctx;
443 struct xfs_cil_ctx *new_ctx;
444 struct xlog_in_core *commit_iclog;
445 struct xlog_ticket *tic;
446 int num_lv;
447 int num_iovecs;
448 int len;
449 int error = 0;
450 struct xfs_trans_header thdr;
451 struct xfs_log_iovec lhdr;
452 struct xfs_log_vec lvhdr = { NULL };
453 xfs_lsn_t commit_lsn;
454
455 if (!cil)
456 return 0;
457
458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
459 new_ctx->ticket = xlog_cil_ticket_alloc(log);
460
461 /* lock out transaction commit, but don't block on background push */
462 if (!down_write_trylock(&cil->xc_ctx_lock)) {
463 if (!push_now)
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx;
468
469 /* check if we've anything to push */
470 if (list_empty(&cil->xc_cil))
471 goto out_skip;
472
473 /* check for spurious background flush */
474 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
475 goto out_skip;
476
477 /*
478 * pull all the log vectors off the items in the CIL, and
479 * remove the items from the CIL. We don't need the CIL lock
480 * here because it's only needed on the transaction commit
481 * side which is currently locked out by the flush lock.
482 */
483 lv = NULL;
484 num_lv = 0;
485 num_iovecs = 0;
486 len = 0;
487 while (!list_empty(&cil->xc_cil)) {
488 struct xfs_log_item *item;
489 int i;
490
491 item = list_first_entry(&cil->xc_cil,
492 struct xfs_log_item, li_cil);
493 list_del_init(&item->li_cil);
494 if (!ctx->lv_chain)
495 ctx->lv_chain = item->li_lv;
496 else
497 lv->lv_next = item->li_lv;
498 lv = item->li_lv;
499 item->li_lv = NULL;
500
501 num_lv++;
502 num_iovecs += lv->lv_niovecs;
503 for (i = 0; i < lv->lv_niovecs; i++)
504 len += lv->lv_iovecp[i].i_len;
505 }
506
507 /*
508 * initialise the new context and attach it to the CIL. Then attach
509 * the current context to the CIL committing list so it can be found
510 * during log forces to extract the commit lsn of the sequence that
511 * needs to be forced.
512 */
513 INIT_LIST_HEAD(&new_ctx->committing);
514 INIT_LIST_HEAD(&new_ctx->busy_extents);
515 new_ctx->sequence = ctx->sequence + 1;
516 new_ctx->cil = cil;
517 cil->xc_ctx = new_ctx;
518
519 /*
520 * The switch is now done, so we can drop the context lock and move out
521 * of a shared context. We can't just go straight to the commit record,
522 * though - we need to synchronise with previous and future commits so
523 * that the commit records are correctly ordered in the log to ensure
524 * that we process items during log IO completion in the correct order.
525 *
526 * For example, if we get an EFI in one checkpoint and the EFD in the
527 * next (e.g. due to log forces), we do not want the checkpoint with
528 * the EFD to be committed before the checkpoint with the EFI. Hence
529 * we must strictly order the commit records of the checkpoints so
530 * that: a) the checkpoint callbacks are attached to the iclogs in the
531 * correct order; and b) the checkpoints are replayed in correct order
532 * in log recovery.
533 *
534 * Hence we need to add this context to the committing context list so
535 * that higher sequences will wait for us to write out a commit record
536 * before they do.
537 */
538 spin_lock(&cil->xc_cil_lock);
539 list_add(&ctx->committing, &cil->xc_committing);
540 spin_unlock(&cil->xc_cil_lock);
541 up_write(&cil->xc_ctx_lock);
542
543 /*
544 * Build a checkpoint transaction header and write it to the log to
545 * begin the transaction. We need to account for the space used by the
546 * transaction header here as it is not accounted for in xlog_write().
547 *
548 * The LSN we need to pass to the log items on transaction commit is
549 * the LSN reported by the first log vector write. If we use the commit
550 * record lsn then we can move the tail beyond the grant write head.
551 */
552 tic = ctx->ticket;
553 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
554 thdr.th_type = XFS_TRANS_CHECKPOINT;
555 thdr.th_tid = tic->t_tid;
556 thdr.th_num_items = num_iovecs;
557 lhdr.i_addr = (xfs_caddr_t)&thdr;
558 lhdr.i_len = sizeof(xfs_trans_header_t);
559 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
560 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
561
562 lvhdr.lv_niovecs = 1;
563 lvhdr.lv_iovecp = &lhdr;
564 lvhdr.lv_next = ctx->lv_chain;
565
566 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
567 if (error)
568 goto out_abort;
569
570 /*
571 * now that we've written the checkpoint into the log, strictly
572 * order the commit records so replay will get them in the right order.
573 */
574restart:
575 spin_lock(&cil->xc_cil_lock);
576 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
577 /*
578 * Higher sequences will wait for this one so skip them.
579 * Don't wait for our own sequence, either.
580 */
581 if (new_ctx->sequence >= ctx->sequence)
582 continue;
583 if (!new_ctx->commit_lsn) {
584 /*
585 * It is still being pushed! Wait for the push to
586 * complete, then start again from the beginning.
587 */
588 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
589 goto restart;
590 }
591 }
592 spin_unlock(&cil->xc_cil_lock);
593
594 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
595 if (error || commit_lsn == -1)
596 goto out_abort;
597
598 /* attach all the transactions w/ busy extents to iclog */
599 ctx->log_cb.cb_func = xlog_cil_committed;
600 ctx->log_cb.cb_arg = ctx;
601 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
602 if (error)
603 goto out_abort;
604
605 /*
606 * now the checkpoint commit is complete and we've attached the
607 * callbacks to the iclog, we can assign the commit LSN to the context
608 * and wake up anyone who is waiting for the commit to complete.
609 */
610 spin_lock(&cil->xc_cil_lock);
611 ctx->commit_lsn = commit_lsn;
612 sv_broadcast(&cil->xc_commit_wait);
613 spin_unlock(&cil->xc_cil_lock);
614
615 /* release the hounds! */
616 return xfs_log_release_iclog(log->l_mp, commit_iclog);
617
618out_skip:
619 up_write(&cil->xc_ctx_lock);
620out_free_ticket:
621 xfs_log_ticket_put(new_ctx->ticket);
622 kmem_free(new_ctx);
623 return 0;
624
625out_abort:
626 xlog_cil_committed(ctx, XFS_LI_ABORTED);
627 return XFS_ERROR(EIO);
628}
629
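
The restart loop above enforces one invariant: a checkpoint may write its
commit record only once every lower-numbered sequence on the committing list
has recorded a commit LSN. Reduced to a predicate, the ordering rule looks
roughly like this (a toy model, not the kernel code):

struct model_ctx { long sequence; long commit_lsn; /* 0 = not committed */ };

static int may_write_commit_record(const struct model_ctx *list, int n,
				   long my_seq)
{
	for (int i = 0; i < n; i++) {
		if (list[i].sequence < my_seq && list[i].commit_lsn == 0)
			return 0;	/* sleep and rescan, as the loop does */
	}
	return 1;
}
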
630/*
631 * Conditionally push the CIL based on the sequence passed in.
632 *
633 * We only need to push if we haven't already pushed the sequence
634 * number given. Hence the only time we will trigger a push here is
635 * if the push sequence is the same as the current context.
636 *
637 * We return the current commit lsn to allow the callers to determine if an
638 * iclog flush is necessary following this call.
639 *
640 * XXX: Initially, just push the CIL unconditionally and return whatever
641 * commit lsn is there. It'll be empty, so this is broken for now.
642 */
643xfs_lsn_t
644xlog_cil_push_lsn(
645 struct log *log,
646 xfs_lsn_t push_seq)
647{
648 struct xfs_cil *cil = log->l_cilp;
649 struct xfs_cil_ctx *ctx;
650 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
651
652restart:
653 down_write(&cil->xc_ctx_lock);
654 ASSERT(push_seq <= cil->xc_ctx->sequence);
655
656 /* check to see if we need to force out the current context */
657 if (push_seq == cil->xc_ctx->sequence) {
658 up_write(&cil->xc_ctx_lock);
659 xlog_cil_push(log, 1);
660 goto restart;
661 }
662
663 /*
664 * See if we can find a previous sequence still committing.
665 * We can drop the flush lock as soon as we have the cil lock
666 * because we are now only comparing contexts protected by
667 * the cil lock.
668 *
669 * We need to wait for all previous sequence commits to complete
670 * before allowing the force of push_seq to go ahead. Hence block
671 * on commits for those as well.
672 */
673 spin_lock(&cil->xc_cil_lock);
674 up_write(&cil->xc_ctx_lock);
675 list_for_each_entry(ctx, &cil->xc_committing, committing) {
676 if (ctx->sequence > push_seq)
677 continue;
678 if (!ctx->commit_lsn) {
679 /*
680 * It is still being pushed! Wait for the push to
681 * complete, then start again from the beginning.
682 */
683 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
684 goto restart;
685 }
686 if (ctx->sequence != push_seq)
687 continue;
688 /* found it! */
689 commit_lsn = ctx->commit_lsn;
690 }
691 spin_unlock(&cil->xc_cil_lock);
692 return commit_lsn;
693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL; we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
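
xfs_log_item_in_current_chkpt() is what the xfs_buf_item_format() hunk earlier
in this patch relies on to tell whether an inode allocation sits in the same
checkpoint as the buffer being formatted. Reduced to its essentials, the test
compares the sequence recorded at the item's first commit with the live
context's sequence (a sketch, not the kernel code):

struct model_item { long li_seq; int on_cil; };

static int model_in_current_chkpt(const struct model_item *ip, long ctx_seq)
{
	if (!ip->on_cil)
		return 0;	/* not committed into any pending checkpoint */
	return ip->li_seq == ctx_seq;
}
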
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf695154451..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as to be
381 * passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425};
426
427/*
428 * The amount of log space we should allow the CIL to aggregate is difficult
429 * to size. Whatever we choose, we have to make sure we can get a reservation
430 * for the log space effectively, that it is large enough to capture
431 * sufficient relogging to reduce log buffer IO significantly, but that it is
432 * not so large for the log that it induces too much latency when writing out
433 * through the iclogs. We track both space consumed and the number of vectors
434 * in the checkpoint context, so we need to decide which to use for limiting.
435 *
436 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of
438 * at least 512 bytes per 32k of log space just for the LR headers. That means
439 * 16KB of reservation per megabyte of delayed logging space we will consume,
440 * plus various headers. The number of headers will vary based on the number of
441 * io vectors, so limiting on a specific number of vectors is going to result
442 * in transactions of varying size. IOWs, it is more consistent to track and
443 * limit space consumed in the log rather than by the number of objects being
444 * logged in order to prevent checkpoint ticket overruns.
445 *
446 * Further, use of static reservations through the log grant mechanism is
447 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
448 * grant) and a significant deadlock potential because regranting write space
449 * can block on log pushes. Hence if we have to regrant log space during a log
450 * push, we can deadlock.
451 *
452 * However, we can avoid this by use of a dynamic "reservation stealing"
453 * technique during transaction commit whereby unused reservation space in the
454 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
455 * space needed by the checkpoint transaction. This means that we never need to
456 * specifically reserve space for the CIL checkpoint transaction, nor do we
457 * need to regrant space once the checkpoint completes. This also means the
458 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself.
460 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the
462 * checkpoint size so long as they don't violate any other size rules. Hence
463 * the initial maximum size for the checkpoint transaction will be set to a
464 * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
465 * right now based on the latency of writing out a large amount of data through
466 * the circular iclog buffers.
467 */
468
469#define XLOG_CIL_SPACE_LIMIT(log) \
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471
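
For concrete numbers: a 16MB log starts background pushes once the CIL holds
4MB of dirty regions, while anything from a 32MB log upwards is capped at 8MB.
A quick userspace check of the same expression (a hypothetical copy of the
macro, assuming l_logsize is in bytes):

#include <stdio.h>

#define CIL_SPACE_LIMIT(logsize) \
	((logsize) >> 2 < 8 * 1024 * 1024 ? (logsize) >> 2 : 8 * 1024 * 1024)

int main(void)
{
	printf("%d\n", CIL_SPACE_LIMIT(16 * 1024 * 1024));	/* 4194304 */
	printf("%d\n", CIL_SPACE_LIMIT(128 * 1024 * 1024));	/* 8388608 */
	return 0;
}
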
472/*
382 * The reservation head lsn is not made up of a cycle number and block number. 473 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 474 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 475 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 479 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 480 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 481 struct xfs_ail *l_ailp; /* AIL log is working with */
482 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 483 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 484 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
438 530
439#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
440 532
441
442/* common routines */ 533/* common routines */
443extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
444extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
445extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
446extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
447 538
448extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
449 544
450static inline void 545static inline void
451xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) 546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
455 *off += bytes; 550 *off += bytes;
456} 551}
457 552
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
567
458/* 568/*
459 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
460 * Its value must be outside the range of XFS_TRANS_* values. 570 * Its value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e366315..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans(
1576 1576
1577 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1578 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1579 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log, 1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass); 1581 trans, item, pass);
1582 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1(
1638 /* 1638 /*
1639 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1640 */ 1640 */
1641 if (!(flags & XFS_BLI_CANCEL)) { 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1643 return; 1643 return;
1644 } 1644 }
@@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1(
1696 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1697 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1698 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1699 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1700 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1701 * if this is the last reference. 1701 * if this is the last reference.
1702 * 1702 *
@@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled(
1721 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1722 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1723 */ 1723 */
1724 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1725 return 0; 1725 return 0;
1726 } 1726 }
1727 1727
@@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled(
1733 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1734 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1735 */ 1735 */
1736 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1737 return 0; 1737 return 0;
1738 } 1738 }
1739 1739
@@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled(
1752 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1753 * last reference. 1753 * last reference.
1754 */ 1754 */
1755 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1756 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1757 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1758 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled(
1772 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1773 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1774 */ 1774 */
1775 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1776 return 0; 1776 return 0;
1777} 1777}
1778 1778
@@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1874 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1875 bit); 1875 bit);
1876 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1877 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1878 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1879 item_index++; 1879 item_index++;
1880 } 1880 }
1881 1881
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1889 } 1889 }
1890 1890
1891 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1894 1894
1895 /* 1895 /*
@@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1955 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1956 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1957 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1958 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1959 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1960 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1961 1961
1962 /* 1962 /*
1963 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1966 */ 1966 */
1967 error = 0; 1967 error = 0;
1968 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1969 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1970 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1971 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1972 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1987 } 1987 }
1988 1988
1989 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1990 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1991 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1992 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1993 next: 1993 next:
1994 i++; 1994 i++;
1995 bit += nbits; 1995 bit += nbits;
@@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2148 } 2148 }
2149 2149
2150 type = 0; 2150 type = 0;
2151 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2152 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2153 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2154 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2155 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2156 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2157 /* 2157 /*
2158 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2173 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2174 * 2174 *
2175 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2176 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2177 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2178 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2179 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2207 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2208 /* 2208 /*
2209 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2210 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2211 */ 2211 */
2212 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2213 return 0; 2213 return 0;
@@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2244 2244
2245 mp = log->l_mp; 2245 mp = log->l_mp;
2246 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2247 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2248 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2249 2249
2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans(
2257 } 2257 }
2258 2258
2259 error = 0; 2259 error = 0;
2260 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2262 } else if (flags & 2262 } else if (flags &
2263 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2265 } else { 2265 } else {
2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af2..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,6 +44,7 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47#include "xfs_trace.h"
47 48
48kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
49 50
@@ -243,9 +244,8 @@ _xfs_trans_alloc(
243 tp->t_type = type; 244 tp->t_type = type;
244 tp->t_mountp = mp; 245 tp->t_mountp = mp;
245 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
246 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
247 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
248 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
249 return tp; 249 return tp;
250} 250}
251 251
@@ -255,8 +255,13 @@ _xfs_trans_alloc(
255 */ 255 */
256STATIC void 256STATIC void
257xfs_trans_free( 257xfs_trans_free(
258 xfs_trans_t *tp) 258 struct xfs_trans *tp)
259{ 259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
260 atomic_dec(&tp->t_mountp->m_active_trans); 265 atomic_dec(&tp->t_mountp->m_active_trans);
261 xfs_trans_free_dqinfo(tp); 266 xfs_trans_free_dqinfo(tp);
262 kmem_zone_free(xfs_trans_zone, tp); 267 kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +290,8 @@ xfs_trans_dup(
285 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
286 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
287 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
288 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
289 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
290 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
291 295
292 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
293 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -423,7 +427,6 @@ undo_blocks:
423 return error; 427 return error;
424} 428}
425 429
426
427/* 430/*
428 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
429 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas(
652 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
653 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
654 */ 657 */
655STATIC void 658void
656xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
657 xfs_trans_t *tp) 660 xfs_trans_t *tp)
658{ 661{
@@ -880,7 +883,7 @@ xfs_trans_fill_vecs(
880 * they could be immediately flushed and we'd have to race with the flusher 883 * they could be immediately flushed and we'd have to race with the flusher
881 * trying to pull the item from the AIL as we add it. 884 * trying to pull the item from the AIL as we add it.
882 */ 885 */
883static void 886void
884xfs_trans_item_committed( 887xfs_trans_item_committed(
885 struct xfs_log_item *lip, 888 struct xfs_log_item *lip,
886 xfs_lsn_t commit_lsn, 889 xfs_lsn_t commit_lsn,
@@ -930,26 +933,6 @@ xfs_trans_item_committed(
930 IOP_UNPIN(lip); 933 IOP_UNPIN(lip);
931} 934}
932 935
933/* Clear all the per-AG busy list items listed in this transaction */
934static void
935xfs_trans_clear_busy_extents(
936 struct xfs_trans *tp)
937{
938 xfs_log_busy_chunk_t *lbcp;
939 xfs_log_busy_slot_t *lbsp;
940 int i;
941
942 for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
943 i = 0;
944 for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
945 if (XFS_LBC_ISFREE(lbcp, i))
946 continue;
947 xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
948 }
949 }
950 xfs_trans_free_busy(tp);
951}
952
953/* 936/*
954 * This is typically called by the LM when a transaction has been fully 937 * This is typically called by the LM when a transaction has been fully
955 * committed to disk. It needs to unpin the items which have 938 * committed to disk. It needs to unpin the items which have
@@ -984,7 +967,6 @@ xfs_trans_committed(
984 kmem_free(licp); 967 kmem_free(licp);
985 } 968 }
986 969
987 xfs_trans_clear_busy_extents(tp);
988 xfs_trans_free(tp); 970 xfs_trans_free(tp);
989} 971}
990 972
@@ -1012,8 +994,7 @@ xfs_trans_uncommit(
1012 xfs_trans_unreserve_and_mod_sb(tp); 994 xfs_trans_unreserve_and_mod_sb(tp);
1013 xfs_trans_unreserve_and_mod_dquots(tp); 995 xfs_trans_unreserve_and_mod_dquots(tp);
1014 996
1015 xfs_trans_free_items(tp, flags); 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1016 xfs_trans_free_busy(tp);
1017 xfs_trans_free(tp); 998 xfs_trans_free(tp);
1018} 999}
1019 1000
@@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog(
1075 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1056 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
1076 1057
1077 tp->t_commit_lsn = *commit_lsn; 1058 tp->t_commit_lsn = *commit_lsn;
1059 trace_xfs_trans_commit_lsn(tp);
1060
1078 if (nvec > XFS_TRANS_LOGVEC_COUNT) 1061 if (nvec > XFS_TRANS_LOGVEC_COUNT)
1079 kmem_free(log_vector); 1062 kmem_free(log_vector);
1080 1063
@@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog(
1161 return xfs_log_release_iclog(mp, commit_iclog); 1144 return xfs_log_release_iclog(mp, commit_iclog);
1162} 1145}
1163 1146
1147/*
1148 * Walk the log items and allocate log vector structures for
1149 * each item large enough to fit all the vectors they require.
1150 * Note that this format differs from the old log vector format in
1151 * that there is no transaction header in these log vectors.
1152 */
1153STATIC struct xfs_log_vec *
1154xfs_trans_alloc_log_vecs(
1155 xfs_trans_t *tp)
1156{
1157 xfs_log_item_desc_t *lidp;
1158 struct xfs_log_vec *lv = NULL;
1159 struct xfs_log_vec *ret_lv = NULL;
1160
1161 lidp = xfs_trans_first_item(tp);
1162
1163 /* Bail out if we didn't find a log item. */
1164 if (!lidp) {
1165 ASSERT(0);
1166 return NULL;
1167 }
1168
1169 while (lidp != NULL) {
1170 struct xfs_log_vec *new_lv;
1171
1172 /* Skip items which aren't dirty in this transaction. */
1173 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1174 lidp = xfs_trans_next_item(tp, lidp);
1175 continue;
1176 }
1177
1178 /* Skip items that do not have any vectors for writing */
1179 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1180 if (!lidp->lid_size) {
1181 lidp = xfs_trans_next_item(tp, lidp);
1182 continue;
1183 }
1184
1185 new_lv = kmem_zalloc(sizeof(*new_lv) +
1186 lidp->lid_size * sizeof(struct xfs_log_iovec),
1187 KM_SLEEP);
1188
1189 /* The allocated iovec region lies beyond the log vector. */
1190 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1191 new_lv->lv_niovecs = lidp->lid_size;
1192 new_lv->lv_item = lidp->lid_item;
1193 if (!ret_lv)
1194 ret_lv = new_lv;
1195 else
1196 lv->lv_next = new_lv;
1197 lv = new_lv;
1198 lidp = xfs_trans_next_item(tp, lidp);
1199 }
1200
1201 return ret_lv;
1202}
1203
1204static int
1205xfs_trans_commit_cil(
1206 struct xfs_mount *mp,
1207 struct xfs_trans *tp,
1208 xfs_lsn_t *commit_lsn,
1209 int flags)
1210{
1211 struct xfs_log_vec *log_vector;
1212 int error;
1213
1214 /*
1215 * Get each log item to allocate a vector structure for
1216 * the log item to pass to the log write code. The
1217 * CIL commit code will format the vector and save it away.
1218 */
1219 log_vector = xfs_trans_alloc_log_vecs(tp);
1220 if (!log_vector)
1221 return ENOMEM;
1222
1223 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1224 if (error)
1225 return error;
1226
1227 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1228
1229 /* xfs_trans_free_items() unlocks them first */
1230 xfs_trans_free_items(tp, *commit_lsn, 0);
1231 xfs_trans_free(tp);
1232 return 0;
1233}
1164 1234
1165/* 1235/*
1166 * xfs_trans_commit 1236 * xfs_trans_commit
@@ -1221,7 +1291,11 @@ _xfs_trans_commit(
1221 xfs_trans_apply_sb_deltas(tp); 1291 xfs_trans_apply_sb_deltas(tp);
1222 xfs_trans_apply_dquot_deltas(tp); 1292 xfs_trans_apply_dquot_deltas(tp);
1223 1293
1224 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); 1294 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1295 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1296 else
1297 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1298
1225 if (error == ENOMEM) { 1299 if (error == ENOMEM) {
1226 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1300 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1227 error = XFS_ERROR(EIO); 1301 error = XFS_ERROR(EIO);
@@ -1259,8 +1333,7 @@ out_unreserve:
1259 error = XFS_ERROR(EIO); 1333 error = XFS_ERROR(EIO);
1260 } 1334 }
1261 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1335 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1262 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); 1336 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1263 xfs_trans_free_busy(tp);
1264 xfs_trans_free(tp); 1337 xfs_trans_free(tp);
1265 1338
1266 XFS_STATS_INC(xs_trans_empty); 1339 XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1411,7 @@ xfs_trans_cancel(
1338 /* mark this thread as no longer being in a transaction */ 1411 /* mark this thread as no longer being in a transaction */
1339 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1412 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1340 1413
1341 xfs_trans_free_items(tp, flags); 1414 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1342 xfs_trans_free_busy(tp);
1343 xfs_trans_free(tp); 1415 xfs_trans_free(tp);
1344} 1416}
1345 1417
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921e..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
106#define XFS_TRANS_GROWFSRT_FREE 39 106#define XFS_TRANS_GROWFSRT_FREE 39
107#define XFS_TRANS_SWAPEXT 40 107#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 108#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_TYPE_MAX 41 109#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42
110/* new transaction types need to be reflected in xfs_logprint(8) */ 111/* new transaction types need to be reflected in xfs_logprint(8) */
111 112
112#define XFS_TRANS_TYPES \ 113#define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 149 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 150 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ 151 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
151 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 153 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
152 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 154 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
153 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 155 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
@@ -813,6 +815,7 @@ struct xfs_log_item_desc;
813struct xfs_mount; 815struct xfs_mount;
814struct xfs_trans; 816struct xfs_trans;
815struct xfs_dquot_acct; 817struct xfs_dquot_acct;
818struct xfs_busy_extent;
816 819
817typedef struct xfs_log_item { 820typedef struct xfs_log_item {
818 struct list_head li_ail; /* AIL pointers */ 821 struct list_head li_ail; /* AIL pointers */
@@ -828,6 +831,11 @@ typedef struct xfs_log_item {
828 /* buffer item iodone */ 831 /* buffer item iodone */
829 /* callback func */ 832 /* callback func */
830 struct xfs_item_ops *li_ops; /* function list */ 833 struct xfs_item_ops *li_ops; /* function list */
834
835 /* delayed logging */
836 struct list_head li_cil; /* CIL pointers */
837 struct xfs_log_vec *li_lv; /* active log vector */
838 xfs_lsn_t li_seq; /* CIL commit seq */
831} xfs_log_item_t; 839} xfs_log_item_t;
832 840
833#define XFS_LI_IN_AIL 0x1 841#define XFS_LI_IN_AIL 0x1
@@ -872,34 +880,6 @@ typedef struct xfs_item_ops {
872#define XFS_ITEM_PUSHBUF 3 880#define XFS_ITEM_PUSHBUF 3
873 881
874/* 882/*
875 * This structure is used to maintain a list of block ranges that have been
876 * freed in the transaction. The ranges are listed in the perag[] busy list
877 * between when they're freed and the transaction is committed to disk.
878 */
879
880typedef struct xfs_log_busy_slot {
881 xfs_agnumber_t lbc_ag;
882 ushort lbc_idx; /* index in perag.busy[] */
883} xfs_log_busy_slot_t;
884
885#define XFS_LBC_NUM_SLOTS 31
886typedef struct xfs_log_busy_chunk {
887 struct xfs_log_busy_chunk *lbc_next;
888 uint lbc_free; /* free slots bitmask */
889 ushort lbc_unused; /* first unused */
890 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
891} xfs_log_busy_chunk_t;
892
893#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
894#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
895
896#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
897#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
898#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
899#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
900#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
901
902/*
903 * This is the type of function which can be given to xfs_trans_callback() 883 * This is the type of function which can be given to xfs_trans_callback()
904 * to be called upon the transaction's commit to disk. 884 * to be called upon the transaction's commit to disk.
905 */ 885 */
@@ -950,8 +930,7 @@ typedef struct xfs_trans {
950 unsigned int t_items_free; /* log item descs free */ 930 unsigned int t_items_free; /* log item descs free */
951 xfs_log_item_chunk_t t_items; /* first log item desc chunk */ 931 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
952 xfs_trans_header_t t_header; /* header for in-log trans */ 932 xfs_trans_header_t t_header; /* header for in-log trans */
953 unsigned int t_busy_free; /* busy descs free */ 933 struct list_head t_busy; /* list of busy extents */
954 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
955 unsigned long t_pflags; /* saved process flags state */ 934 unsigned long t_pflags; /* saved process flags state */
956} xfs_trans_t; 935} xfs_trans_t;
957 936
@@ -1025,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
1025void xfs_trans_cancel(xfs_trans_t *, int); 1004void xfs_trans_cancel(xfs_trans_t *, int);
1026int xfs_trans_ail_init(struct xfs_mount *); 1005int xfs_trans_ail_init(struct xfs_mount *);
1027void xfs_trans_ail_destroy(struct xfs_mount *); 1006void xfs_trans_ail_destroy(struct xfs_mount *);
1028xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1029 xfs_agnumber_t ag,
1030 xfs_extlen_t idx);
1031 1007
1032extern kmem_zone_t *xfs_trans_zone; 1008extern kmem_zone_t *xfs_trans_zone;
1033 1009
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3a..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
114 xfs_buf_item_init(bp, tp->t_mountp); 114 xfs_buf_item_init(bp, tp->t_mountp);
115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
119 if (reset_recur) 119 if (reset_recur)
120 bip->bli_recur = 0; 120 bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
515 ASSERT(atomic_read(&bip->bli_refcount) > 0); 515 ASSERT(atomic_read(&bip->bli_refcount) > 0);
516 516
517 /* 517 /*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
619 619
620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
623 ASSERT(atomic_read(&bip->bli_refcount) > 0); 623 ASSERT(atomic_read(&bip->bli_refcount) > 0);
624 bip->bli_flags |= XFS_BLI_HOLD; 624 bip->bli_flags |= XFS_BLI_HOLD;
625 trace_xfs_trans_bhold(bip); 625 trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
641 641
642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
645 ASSERT(atomic_read(&bip->bli_refcount) > 0); 645 ASSERT(atomic_read(&bip->bli_refcount) > 0);
646 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 646 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
647 bip->bli_flags &= ~XFS_BLI_HOLD; 647 bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
704 bip->bli_flags &= ~XFS_BLI_STALE; 704 bip->bli_flags &= ~XFS_BLI_STALE;
705 ASSERT(XFS_BUF_ISSTALE(bp)); 705 ASSERT(XFS_BUF_ISSTALE(bp));
706 XFS_BUF_UNSTALE(bp); 706 XFS_BUF_UNSTALE(bp);
707 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; 707 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
708 } 708 }
709 709
710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); 710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
763 ASSERT(XFS_BUF_ISSTALE(bp)); 763 ASSERT(XFS_BUF_ISSTALE(bp));
764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); 765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 766 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
769 return; 769 return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
774 * in the buf log item. The STALE flag will be used in 774 * in the buf log item. The STALE flag will be used in
775 * xfs_buf_item_unpin() to determine if it should clean up 775 * xfs_buf_item_unpin() to determine if it should clean up
776 * when the last reference to the buf item is given up. 776 * when the last reference to the buf item is given up.
777 * We set the XFS_BLI_CANCEL flag in the buf log format structure 777 * We set the XFS_BLF_CANCEL flag in the buf log format structure
778 * and log the buf item. This will be used at recovery time 778 * and log the buf item. This will be used at recovery time
779 * to determine that copies of the buffer in the log before 779 * to determine that copies of the buffer in the log before
780 * this should not be replayed. 780 * this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
792 XFS_BUF_UNDELAYWRITE(bp); 792 XFS_BUF_UNDELAYWRITE(bp);
793 XFS_BUF_STALE(bp); 793 XFS_BUF_STALE(bp);
794 bip->bli_flags |= XFS_BLI_STALE; 794 bip->bli_flags |= XFS_BLI_STALE;
795 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 795 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; 796 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 bip->bli_format.blf_flags |= XFS_BLI_CANCEL; 797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798 memset((char *)(bip->bli_format.blf_data_map), 0, 798 memset((char *)(bip->bli_format.blf_data_map), 0,
799 (bip->bli_format.blf_map_size * sizeof(uint))); 799 (bip->bli_format.blf_map_size * sizeof(uint)));
800 lidp->lid_flags |= XFS_LID_DIRTY; 800 lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
802} 802}
803 803
804/* 804/*
805 * This call is used to indicate that the buffer contains on-disk 805 * This call is used to indicate that the buffer contains on-disk inodes which
806 * inodes which must be handled specially during recovery. They 806 * must be handled specially during recovery. They require special handling
807 * require special handling because only the di_next_unlinked from 807 * because only the di_next_unlinked from the inodes in the buffer should be
808 * the inodes in the buffer should be recovered. The rest of the 808 * recovered. The rest of the data in the buffer is logged via the inodes
809 * data in the buffer is logged via the inodes themselves. 809 * themselves.
810 * 810 *
811 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log 811 * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
812 * format structure so that we'll know what to do at recovery time. 812 * transferred to the buffer's log format structure so that we'll know what to
813 * do at recovery time.
813 */ 814 */
814/* ARGSUSED */
815void 815void
816xfs_trans_inode_buf( 816xfs_trans_inode_buf(
817 xfs_trans_t *tp, 817 xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
827 ASSERT(atomic_read(&bip->bli_refcount) > 0); 827 ASSERT(atomic_read(&bip->bli_refcount) > 0);
828 828
829 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; 829 bip->bli_flags |= XFS_BLI_INODE_BUF;
830} 830}
831 831
832/* 832/*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
908 ASSERT(XFS_BUF_ISBUSY(bp)); 908 ASSERT(XFS_BUF_ISBUSY(bp));
909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
911 ASSERT(type == XFS_BLI_UDQUOT_BUF || 911 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
912 type == XFS_BLI_PDQUOT_BUF || 912 type == XFS_BLF_PDQUOT_BUF ||
913 type == XFS_BLI_GDQUOT_BUF); 913 type == XFS_BLF_GDQUOT_BUF);
914 914
915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
916 ASSERT(atomic_read(&bip->bli_refcount) > 0); 916 ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299void 299void
300xfs_trans_free_items( 300xfs_trans_free_items(
301 xfs_trans_t *tp, 301 xfs_trans_t *tp,
302 xfs_lsn_t commit_lsn,
302 int flags) 303 int flags)
303{ 304{
304 xfs_log_item_chunk_t *licp; 305 xfs_log_item_chunk_t *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 * Special case the embedded chunk so we don't free it below. 312 * Special case the embedded chunk so we don't free it below.
312 */ 313 */
313 if (!xfs_lic_are_all_free(licp)) { 314 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 xfs_lic_all_free(licp); 316 xfs_lic_all_free(licp);
316 licp->lic_unused = 0; 317 licp->lic_unused = 0;
317 } 318 }
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 */ 323 */
323 while (licp != NULL) { 324 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp)); 325 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 326 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 next_licp = licp->lic_next; 327 next_licp = licp->lic_next;
327 kmem_free(licp); 328 kmem_free(licp);
328 licp = next_licp; 329 licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
438 439
439 return freed; 440 return freed;
440} 441}
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); 35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, 36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *); 37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int); 38
39void xfs_trans_unlock_items(struct xfs_trans *, 39void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 xfs_lsn_t); 40void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41void xfs_trans_free_busy(xfs_trans_t *tp); 41 int flags);
42xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 42
43 xfs_agnumber_t ag, 43void xfs_trans_item_committed(struct xfs_log_item *lip,
44 xfs_extlen_t idx); 44 xfs_lsn_t commit_lsn, int aborted);
45void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
45 46
46/* 47/*
47 * AIL traversal cursor. 48 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ 76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77 77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78/* 80/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 81 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 82 * Disk based types:
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index c33749f95b32..058129e9b04c 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -30,8 +30,7 @@
30 * atomic_read - read atomic variable 30 * atomic_read - read atomic variable
31 * @v: pointer of type atomic_t 31 * @v: pointer of type atomic_t
32 * 32 *
33 * Atomically reads the value of @v. Note that the guaranteed 33 * Atomically reads the value of @v.
34 * useful range of an atomic_t is only 24 bits.
35 */ 34 */
36#define atomic_read(v) (*(volatile int *)&(v)->counter) 35#define atomic_read(v) (*(volatile int *)&(v)->counter)
37 36
@@ -40,8 +39,7 @@
40 * @v: pointer of type atomic_t 39 * @v: pointer of type atomic_t
41 * @i: required value 40 * @i: required value
42 * 41 *
43 * Atomically sets the value of @v to @i. Note that the guaranteed 42 * Atomically sets the value of @v to @i.
44 * useful range of an atomic_t is only 24 bits.
45 */ 43 */
46#define atomic_set(v, i) (((v)->counter) = (i)) 44#define atomic_set(v, i) (((v)->counter) = (i))
47 45
@@ -53,7 +51,6 @@
53 * @v: pointer of type atomic_t 51 * @v: pointer of type atomic_t
54 * 52 *
55 * Atomically adds @i to @v and returns the result 53 * Atomically adds @i to @v and returns the result
56 * Note that the guaranteed useful range of an atomic_t is only 24 bits.
57 */ 54 */
58static inline int atomic_add_return(int i, atomic_t *v) 55static inline int atomic_add_return(int i, atomic_t *v)
59{ 56{
@@ -75,7 +72,6 @@ static inline int atomic_add_return(int i, atomic_t *v)
75 * @v: pointer of type atomic_t 72 * @v: pointer of type atomic_t
76 * 73 *
77 * Atomically subtracts @i from @v and returns the result 74 * Atomically subtracts @i from @v and returns the result
78 * Note that the guaranteed useful range of an atomic_t is only 24 bits.
79 */ 75 */
80static inline int atomic_sub_return(int i, atomic_t *v) 76static inline int atomic_sub_return(int i, atomic_t *v)
81{ 77{
diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h
index 97e807c8c812..0232ccb76f2b 100644
--- a/include/asm-generic/kmap_types.h
+++ b/include/asm-generic/kmap_types.h
@@ -29,6 +29,9 @@ KMAP_D(16) KM_IRQ_PTE,
29KMAP_D(17) KM_NMI, 29KMAP_D(17) KM_NMI,
30KMAP_D(18) KM_NMI_PTE, 30KMAP_D(18) KM_NMI_PTE,
31KMAP_D(19) KM_KDB, 31KMAP_D(19) KM_KDB,
32/*
33 * Remember to update debug_kmap_atomic() when adding new kmap types!
34 */
32KMAP_D(20) KM_TYPE_NR 35KMAP_D(20) KM_TYPE_NR
33}; 36};
34 37
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 3c80fd7e8b56..d53a67dff018 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -7,6 +7,9 @@
7#ifndef __BIG_ENDIAN_BITFIELD 7#ifndef __BIG_ENDIAN_BITFIELD
8#define __BIG_ENDIAN_BITFIELD 8#define __BIG_ENDIAN_BITFIELD
9#endif 9#endif
10#ifndef __BYTE_ORDER
11#define __BYTE_ORDER __BIG_ENDIAN
12#endif
10 13
11#include <linux/types.h> 14#include <linux/types.h>
12#include <linux/swab.h> 15#include <linux/swab.h>
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 83195fb82962..f7f8ad13adb6 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -7,6 +7,9 @@
7#ifndef __LITTLE_ENDIAN_BITFIELD 7#ifndef __LITTLE_ENDIAN_BITFIELD
8#define __LITTLE_ENDIAN_BITFIELD 8#define __LITTLE_ENDIAN_BITFIELD
9#endif 9#endif
10#ifndef __BYTE_ORDER
11#define __BYTE_ORDER __LITTLE_ENDIAN
12#endif
10 13
11#include <linux/types.h> 14#include <linux/types.h>
12#include <linux/swab.h> 15#include <linux/swab.h>
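With __BYTE_ORDER supplied by both generic byteorder headers, shared code can compare it against __BIG_ENDIAN or __LITTLE_ENDIAN directly instead of probing which *_BITFIELD macro is defined. For instance (cpu_to_be32_const() is a hypothetical example, not an existing kernel macro):

#if __BYTE_ORDER == __BIG_ENDIAN
#define cpu_to_be32_const(x)    (x)                     /* already big-endian */
#else
#define cpu_to_be32_const(x)    ___constant_swab32(x)   /* swap on little-endian */
#endif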
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
new file mode 100644
index 000000000000..5ac51552d908
--- /dev/null
+++ b/include/linux/compaction.h
@@ -0,0 +1,89 @@
1#ifndef _LINUX_COMPACTION_H
2#define _LINUX_COMPACTION_H
3
4/* Return values for compact_zone() and try_to_compact_pages() */
5/* compaction didn't start as it was not possible or direct reclaim was more suitable */
6#define COMPACT_SKIPPED 0
7/* compaction should continue to another pageblock */
8#define COMPACT_CONTINUE 1
9/* direct compaction partially compacted a zone and there are suitable pages */
10#define COMPACT_PARTIAL 2
11/* The full zone was compacted */
12#define COMPACT_COMPLETE 3
13
14#ifdef CONFIG_COMPACTION
15extern int sysctl_compact_memory;
16extern int sysctl_compaction_handler(struct ctl_table *table, int write,
17 void __user *buffer, size_t *length, loff_t *ppos);
18extern int sysctl_extfrag_threshold;
19extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
20 void __user *buffer, size_t *length, loff_t *ppos);
21
22extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask);
25
26/* Do not skip compaction more than 64 times */
27#define COMPACT_MAX_DEFER_SHIFT 6
28
29/*
30 * Compaction is deferred when compaction fails to result in a page
31 * allocation success. 1 << compact_defer_shift compactions are skipped up
32 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
33 */
34static inline void defer_compaction(struct zone *zone)
35{
36 zone->compact_considered = 0;
37 zone->compact_defer_shift++;
38
39 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
40 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
41}
42
43/* Returns true if compaction should be skipped this time */
44static inline bool compaction_deferred(struct zone *zone)
45{
46 unsigned long defer_limit = 1UL << zone->compact_defer_shift;
47
48 /* Avoid possible overflow */
49 if (++zone->compact_considered > defer_limit)
50 zone->compact_considered = defer_limit;
51
52 return zone->compact_considered < (1UL << zone->compact_defer_shift);
53}
54
55#else
56static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
57 int order, gfp_t gfp_mask, nodemask_t *nodemask)
58{
59 return COMPACT_CONTINUE;
60}
61
62static inline void defer_compaction(struct zone *zone)
63{
64}
65
66static inline bool compaction_deferred(struct zone *zone)
67{
68 return true;
69}
70
71#endif /* CONFIG_COMPACTION */
72
73#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
74extern int compaction_register_node(struct node *node);
75extern void compaction_unregister_node(struct node *node);
76
77#else
78
79static inline int compaction_register_node(struct node *node)
80{
81 return 0;
82}
83
84static inline void compaction_unregister_node(struct node *node)
85{
86}
87#endif /* CONFIG_COMPACTION && CONFIG_SYSFS && CONFIG_NUMA */
88
89#endif /* _LINUX_COMPACTION_H */
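The defer logic above implements a capped exponential backoff: each failed compaction doubles how many subsequent attempts are skipped, up to 1 << COMPACT_MAX_DEFER_SHIFT. A standalone model of the two helpers (struct zone_model is a stand-in for the compact_* fields this series adds to struct zone):

#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

/* stand-in for the compact_* fields added to struct zone */
struct zone_model {
        unsigned long compact_considered;
        unsigned int compact_defer_shift;
};

static void defer_compaction(struct zone_model *zone)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;
        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

static bool compaction_deferred(struct zone_model *zone)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;
        return zone->compact_considered < defer_limit;
}

int main(void)
{
        struct zone_model zone = { 0, 0 };
        int i;

        defer_compaction(&zone);        /* 1st failure: skip the next 1 attempt */
        defer_compaction(&zone);        /* 2nd failure: skip the next 3 attempts */
        for (i = 1; i <= 4; i++)
                printf("attempt %d deferred: %d\n", i, compaction_deferred(&zone));
        return 0;       /* prints 1, 1, 1, 0 */
}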
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a73454aec333..20b51cab6593 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void);
86 86
87extern void cpuset_print_task_mems_allowed(struct task_struct *p); 87extern void cpuset_print_task_mems_allowed(struct task_struct *p);
88 88
89/*
90 * reading current mems_allowed and mempolicy in the fastpath must be protected
91 * by get_mems_allowed()
92 */
93static inline void get_mems_allowed(void)
94{
95 current->mems_allowed_change_disable++;
96
97 /*
98 * ensure that reading mems_allowed and mempolicy happens after the
99 * update of ->mems_allowed_change_disable.
100 *
101 * the write-side task finds ->mems_allowed_change_disable is not 0,
102 * and knows the read-side task is reading mems_allowed or mempolicy,
103 * so it will clear old bits lazily.
104 */
105 smp_mb();
106}
107
108static inline void put_mems_allowed(void)
109{
110 /*
111 * ensure that mems_allowed and mempolicy are read before reducing
112 * mems_allowed_change_disable.
113 *
114 * the write-side task will know that the read-side task is still
115 * reading mems_allowed or mempolicy, and won't clear old bits in the
116 * nodemask.
117 */
118 smp_mb();
119 --ACCESS_ONCE(current->mems_allowed_change_disable);
120}
121
89static inline void set_mems_allowed(nodemask_t nodemask) 122static inline void set_mems_allowed(nodemask_t nodemask)
90{ 123{
124 task_lock(current);
91 current->mems_allowed = nodemask; 125 current->mems_allowed = nodemask;
126 task_unlock(current);
92} 127}
93 128
94#else /* !CONFIG_CPUSETS */ 129#else /* !CONFIG_CPUSETS */
@@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
187{ 222{
188} 223}
189 224
225static inline void get_mems_allowed(void)
226{
227}
228
229static inline void put_mems_allowed(void)
230{
231}
232
190#endif /* !CONFIG_CPUSETS */ 233#endif /* !CONFIG_CPUSETS */
191 234
192#endif /* _LINUX_CPUSET_H */ 235#endif /* _LINUX_CPUSET_H */
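Taken together, get_mems_allowed()/put_mems_allowed() and the task_lock() in set_mems_allowed() form a lightweight reader/writer protocol: readers bump a per-task counter around their accesses, and the write side, per the comments above, leaves old nodemask bits in place while any reader is inside the bracket. The intended read-side shape, sketched (the wrapper function and its use of alloc_pages_current() are illustrative, not from this patch):

/* read side, e.g. around a page allocation */
static struct page *alloc_page_respecting_mems(gfp_t gfp)
{
        struct page *page;

        get_mems_allowed();                     /* mark this task as reading mems_allowed */
        page = alloc_pages_current(gfp, 0);     /* consults current->mems_allowed */
        put_mems_allowed();                     /* let the write side clear old bits again */
        return page;
}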
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index f8c2e1767500..b3cd4de9432b 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -28,7 +28,7 @@ struct _ddebug {
28 /* 28 /*
29 * The flags field controls the behaviour at the callsite. 29 * The flags field controls the behaviour at the callsite.
30 * The bits here are changed dynamically when the user 30 * The bits here are changed dynamically when the user
31 * writes commands to <debugfs>/dynamic_debug/ddebug 31 * writes commands to <debugfs>/dynamic_debug/control
32 */ 32 */
33#define _DPRINTK_FLAGS_PRINT (1<<0) /* printk() a message using the format */ 33#define _DPRINTK_FLAGS_PRINT (1<<0) /* printk() a message using the format */
34#define _DPRINTK_FLAGS_DEFAULT 0 34#define _DPRINTK_FLAGS_DEFAULT 0
diff --git a/include/linux/err.h b/include/linux/err.h
index 1b12642636c7..448afc12c78a 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -19,22 +19,22 @@
19 19
20#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) 20#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
21 21
22static inline void *ERR_PTR(long error) 22static inline void * __must_check ERR_PTR(long error)
23{ 23{
24 return (void *) error; 24 return (void *) error;
25} 25}
26 26
27static inline long PTR_ERR(const void *ptr) 27static inline long __must_check PTR_ERR(const void *ptr)
28{ 28{
29 return (long) ptr; 29 return (long) ptr;
30} 30}
31 31
32static inline long IS_ERR(const void *ptr) 32static inline long __must_check IS_ERR(const void *ptr)
33{ 33{
34 return IS_ERR_VALUE((unsigned long)ptr); 34 return IS_ERR_VALUE((unsigned long)ptr);
35} 35}
36 36
37static inline long IS_ERR_OR_NULL(const void *ptr) 37static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
38{ 38{
39 return !ptr || IS_ERR_VALUE((unsigned long)ptr); 39 return !ptr || IS_ERR_VALUE((unsigned long)ptr);
40} 40}
@@ -46,7 +46,7 @@ static inline long IS_ERR_OR_NULL(const void *ptr)
46 * Explicitly cast an error-valued pointer to another pointer type in such a 46 * Explicitly cast an error-valued pointer to another pointer type in such a
47 * way as to make it clear that's what's going on. 47 * way as to make it clear that's what's going on.
48 */ 48 */
49static inline void *ERR_CAST(const void *ptr) 49static inline void * __must_check ERR_CAST(const void *ptr)
50{ 50{
51 /* cast away the const */ 51 /* cast away the const */
52 return (void *) ptr; 52 return (void *) ptr;
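With __must_check, a caller that ignores these return values now draws a compiler warning where that checking is enabled; the calling idiom itself is unchanged. A self-contained userspace rendering, with a made-up open_widget() standing in for any function that returns a pointer or an encoded errno:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO       4095
#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error) { return (void *) error; }
static inline long PTR_ERR(const void *ptr) { return (long) ptr; }
static inline long IS_ERR(const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); }

static int the_widget = 42;

/* hypothetical constructor that fails by encoding -EINVAL in the pointer */
static void *open_widget(int id)
{
        if (id < 0)
                return ERR_PTR(-EINVAL);
        return &the_widget;
}

int main(void)
{
        void *w = open_widget(-1);

        if (IS_ERR(w))  /* in-kernel, silently discarding w would now warn */
                printf("open_widget failed: %ld\n", PTR_ERR(w));
        return 0;       /* prints "open_widget failed: -22" */
}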
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 1296af45169d..f3793ebc241c 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -37,7 +37,7 @@ struct dentry;
37#define FBIOGET_HWCINFO 0x4616 37#define FBIOGET_HWCINFO 0x4616
38#define FBIOPUT_MODEINFO 0x4617 38#define FBIOPUT_MODEINFO 0x4617
39#define FBIOGET_DISPINFO 0x4618 39#define FBIOGET_DISPINFO 0x4618
40 40#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
41 41
42#define FB_TYPE_PACKED_PIXELS 0 /* Packed Pixels */ 42#define FB_TYPE_PACKED_PIXELS 0 /* Packed Pixels */
43#define FB_TYPE_PLANES 1 /* Non interleaved planes */ 43#define FB_TYPE_PLANES 1 /* Non interleaved planes */
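Hoisting FBIO_WAITFORVSYNC into fb.h (it is deleted from ivtvfb.h further down) gives every fbdev client a single definition of the vsync-wait ioctl. Typical userspace usage, sketched (device path and error handling abbreviated):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fb.h>

int main(void)
{
        int fd = open("/dev/fb0", O_RDWR);      /* assumes a framebuffer at fb0 */
        __u32 screen = 0;                       /* display index; 0 = primary head */

        if (fd < 0 || ioctl(fd, FBIO_WAITFORVSYNC, &screen) < 0)
                perror("FBIO_WAITFORVSYNC");    /* otherwise blocks until next vsync */
        return 0;
}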
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4c6d41333f98..975609cb8548 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -15,7 +15,7 @@ struct vm_area_struct;
15 * Zone modifiers (see linux/mmzone.h - low three bits) 15 * Zone modifiers (see linux/mmzone.h - low three bits)
16 * 16 *
17 * Do not put any conditional on these. If necessary modify the definitions 17 * Do not put any conditional on these. If necessary modify the definitions
18 * without the underscores and use the consistently. The definitions here may 18 * without the underscores and use them consistently. The definitions here may
19 * be used in bit comparisons. 19 * be used in bit comparisons.
20 */ 20 */
21#define __GFP_DMA ((__force gfp_t)0x01u) 21#define __GFP_DMA ((__force gfp_t)0x01u)
@@ -101,7 +101,7 @@ struct vm_area_struct;
101 __GFP_NORETRY|__GFP_NOMEMALLOC) 101 __GFP_NORETRY|__GFP_NOMEMALLOC)
102 102
103/* Control slab gfp mask during early boot */ 103/* Control slab gfp mask during early boot */
104#define GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS) 104#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
105 105
106/* Control allocation constraints */ 106/* Control allocation constraints */
107#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) 107#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
@@ -152,12 +152,12 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
152 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the 152 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
153 * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long 153 * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
154 * and there are 16 of them to cover all possible combinations of 154 * and there are 16 of them to cover all possible combinations of
155 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM 155 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
156 * 156 *
157 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. 157 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
158 * But GFP_MOVABLE is not only a zone specifier but also an allocation 158 * But GFP_MOVABLE is not only a zone specifier but also an allocation
159 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. 159 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
160 * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1". 160 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
161 * 161 *
162 * bit result 162 * bit result
163 * ================= 163 * =================
@@ -187,7 +187,7 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
187 187
188#define GFP_ZONE_TABLE ( \ 188#define GFP_ZONE_TABLE ( \
189 (ZONE_NORMAL << 0 * ZONES_SHIFT) \ 189 (ZONE_NORMAL << 0 * ZONES_SHIFT) \
190 | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ 190 | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \
191 | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ 191 | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \
192 | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ 192 | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \
193 | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ 193 | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \
@@ -197,7 +197,7 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags)
197) 197)
198 198
199/* 199/*
200 * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32 200 * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32
201 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per 201 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
202 * entry starting with bit 0. Bit is set if the combination is not 202 * entry starting with bit 0. Bit is set if the combination is not
203 * allowed. 203 * allowed.
@@ -320,17 +320,17 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
320void free_pages_exact(void *virt, size_t size); 320void free_pages_exact(void *virt, size_t size);
321 321
322#define __get_free_page(gfp_mask) \ 322#define __get_free_page(gfp_mask) \
323 __get_free_pages((gfp_mask),0) 323 __get_free_pages((gfp_mask), 0)
324 324
325#define __get_dma_pages(gfp_mask, order) \ 325#define __get_dma_pages(gfp_mask, order) \
326 __get_free_pages((gfp_mask) | GFP_DMA,(order)) 326 __get_free_pages((gfp_mask) | GFP_DMA, (order))
327 327
328extern void __free_pages(struct page *page, unsigned int order); 328extern void __free_pages(struct page *page, unsigned int order);
329extern void free_pages(unsigned long addr, unsigned int order); 329extern void free_pages(unsigned long addr, unsigned int order);
330extern void free_hot_cold_page(struct page *page, int cold); 330extern void free_hot_cold_page(struct page *page, int cold);
331 331
332#define __free_page(page) __free_pages((page), 0) 332#define __free_page(page) __free_pages((page), 0)
333#define free_page(addr) free_pages((addr),0) 333#define free_page(addr) free_pages((addr), 0)
334 334
335void page_alloc_init(void); 335void page_alloc_init(void);
336void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); 336void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
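The GFP_ZONE_TABLE comment in the hunk above packs the zone choice for every combination of the low GFP bits into one word, so gfp_zone() becomes a shift and a mask instead of a chain of conditionals. A minimal userspace model of that lookup follows; the entry width and the handful of filled entries are illustrative (the real table covers all 16 combinations with OPT_ZONE_* values):

#include <stdint.h>
#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE };

#define ZONES_SHIFT 3                    /* assumption: 3 bits per entry */
#define GFP_DMA     0x1u
#define GFP_HIGHMEM 0x2u
#define GFP_DMA32   0x4u
#define GFP_MOVABLE 0x8u

#define ZONE_TABLE ( \
	((uint64_t)ZONE_NORMAL  << 0           * ZONES_SHIFT) | \
	((uint64_t)ZONE_DMA     << GFP_DMA     * ZONES_SHIFT) | \
	((uint64_t)ZONE_HIGHMEM << GFP_HIGHMEM * ZONES_SHIFT) | \
	((uint64_t)ZONE_DMA32   << GFP_DMA32   * ZONES_SHIFT) | \
	((uint64_t)ZONE_MOVABLE << (GFP_MOVABLE | GFP_HIGHMEM) * ZONES_SHIFT))

static enum zone gfp_zone(unsigned int flags)
{
	/* one shift + mask replaces a chain of conditionals */
	return (ZONE_TABLE >> ((flags & 0xf) * ZONES_SHIFT)) &
	       ((1u << ZONES_SHIFT) - 1);
}

int main(void)
{
	printf("%d %d %d\n", gfp_zone(0), gfp_zone(GFP_DMA),
	       gfp_zone(GFP_MOVABLE | GFP_HIGHMEM));   /* 2 0 4 */
	return 0;
}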
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 74152c08ad07..caafd0561aa1 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -27,7 +27,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
27 27
28#include <asm/kmap_types.h> 28#include <asm/kmap_types.h>
29 29
30#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) 30#ifdef CONFIG_DEBUG_HIGHMEM
31 31
32void debug_kmap_atomic(enum km_type type); 32void debug_kmap_atomic(enum km_type type);
33 33
diff --git a/include/linux/ivtvfb.h b/include/linux/ivtvfb.h
index 9d88b29ddf55..e8b92f67f10d 100644
--- a/include/linux/ivtvfb.h
+++ b/include/linux/ivtvfb.h
@@ -33,6 +33,5 @@ struct ivtvfb_dma_frame {
33}; 33};
34 34
35#define IVTVFB_IOC_DMA_FRAME _IOW('V', BASE_VIDIOC_PRIVATE+0, struct ivtvfb_dma_frame) 35#define IVTVFB_IOC_DMA_FRAME _IOW('V', BASE_VIDIOC_PRIVATE+0, struct ivtvfb_dma_frame)
36#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
37 36
38#endif 37#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cc5e3ffe9fce..8317ec4b9f3b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -24,9 +24,9 @@
24extern const char linux_banner[]; 24extern const char linux_banner[];
25extern const char linux_proc_banner[]; 25extern const char linux_proc_banner[];
26 26
27#define USHORT_MAX ((u16)(~0U)) 27#define USHRT_MAX ((u16)(~0U))
28#define SHORT_MAX ((s16)(USHORT_MAX>>1)) 28#define SHRT_MAX ((s16)(USHRT_MAX>>1))
29#define SHORT_MIN (-SHORT_MAX - 1) 29#define SHRT_MIN ((s16)(-SHRT_MAX - 1))
30#define INT_MAX ((int)(~0U>>1)) 30#define INT_MAX ((int)(~0U>>1))
31#define INT_MIN (-INT_MAX - 1) 31#define INT_MIN (-INT_MAX - 1)
32#define UINT_MAX (~0U) 32#define UINT_MAX (~0U)
@@ -375,6 +375,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
375 return buf; 375 return buf;
376} 376}
377 377
378extern int hex_to_bin(char ch);
379
378#ifndef pr_fmt 380#ifndef pr_fmt
379#define pr_fmt(fmt) fmt 381#define pr_fmt(fmt) fmt
380#endif 382#endif
@@ -389,6 +391,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
389 printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) 391 printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
390#define pr_warning(fmt, ...) \ 392#define pr_warning(fmt, ...) \
391 printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) 393 printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
394#define pr_warn pr_warning
392#define pr_notice(fmt, ...) \ 395#define pr_notice(fmt, ...) \
393 printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) 396 printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
394#define pr_info(fmt, ...) \ 397#define pr_info(fmt, ...) \
@@ -423,14 +426,13 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
423 * no local ratelimit_state used in the !PRINTK case 426 * no local ratelimit_state used in the !PRINTK case
424 */ 427 */
425#ifdef CONFIG_PRINTK 428#ifdef CONFIG_PRINTK
426#define printk_ratelimited(fmt, ...) ({ \ 429#define printk_ratelimited(fmt, ...) ({ \
427 static struct ratelimit_state _rs = { \ 430 static DEFINE_RATELIMIT_STATE(_rs, \
428 .interval = DEFAULT_RATELIMIT_INTERVAL, \ 431 DEFAULT_RATELIMIT_INTERVAL, \
429 .burst = DEFAULT_RATELIMIT_BURST, \ 432 DEFAULT_RATELIMIT_BURST); \
430 }; \ 433 \
431 \ 434 if (__ratelimit(&_rs)) \
432 if (__ratelimit(&_rs)) \ 435 printk(fmt, ##__VA_ARGS__); \
433 printk(fmt, ##__VA_ARGS__); \
434}) 436})
435#else 437#else
436/* No effect, but we still get type checking even in the !PRINTK case: */ 438/* No effect, but we still get type checking even in the !PRINTK case: */
@@ -447,6 +449,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
447 printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) 449 printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
448#define pr_warning_ratelimited(fmt, ...) \ 450#define pr_warning_ratelimited(fmt, ...) \
449 printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) 451 printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
452#define pr_warn_ratelimited pr_warning_ratelimited
450#define pr_notice_ratelimited(fmt, ...) \ 453#define pr_notice_ratelimited(fmt, ...) \
451 printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) 454 printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
452#define pr_info_ratelimited(fmt, ...) \ 455#define pr_info_ratelimited(fmt, ...) \
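The USHORT_MAX family is renamed to the <limits.h>-style USHRT_MAX/SHRT_MAX/SHRT_MIN, and SHRT_MIN now carries an explicit (s16) cast so the whole expression has the same 16-bit type as the other two. A quick userspace check of the definitions (the typedefs stand in for the kernel's u16/s16):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t u16;
typedef int16_t  s16;

#define USHRT_MAX ((u16)(~0U))
#define SHRT_MAX  ((s16)(USHRT_MAX >> 1))
#define SHRT_MIN  ((s16)(-SHRT_MAX - 1))

int main(void)
{
	/* expected: 65535 32767 -32768 */
	printf("%u %d %d\n", (unsigned int)USHRT_MAX,
	       (int)SHRT_MAX, (int)SHRT_MIN);
	return 0;
}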
diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h
index f1ca0dcc1628..0e8a346424bb 100644
--- a/include/linux/lis3lv02d.h
+++ b/include/linux/lis3lv02d.h
@@ -25,12 +25,14 @@ struct lis3lv02d_platform_data {
25#define LIS3_IRQ1_FF_WU_12 (3 << 0) 25#define LIS3_IRQ1_FF_WU_12 (3 << 0)
26#define LIS3_IRQ1_DATA_READY (4 << 0) 26#define LIS3_IRQ1_DATA_READY (4 << 0)
27#define LIS3_IRQ1_CLICK (7 << 0) 27#define LIS3_IRQ1_CLICK (7 << 0)
28#define LIS3_IRQ1_MASK (7 << 0)
28#define LIS3_IRQ2_DISABLE (0 << 3) 29#define LIS3_IRQ2_DISABLE (0 << 3)
29#define LIS3_IRQ2_FF_WU_1 (1 << 3) 30#define LIS3_IRQ2_FF_WU_1 (1 << 3)
30#define LIS3_IRQ2_FF_WU_2 (2 << 3) 31#define LIS3_IRQ2_FF_WU_2 (2 << 3)
31#define LIS3_IRQ2_FF_WU_12 (3 << 3) 32#define LIS3_IRQ2_FF_WU_12 (3 << 3)
32#define LIS3_IRQ2_DATA_READY (4 << 3) 33#define LIS3_IRQ2_DATA_READY (4 << 3)
33#define LIS3_IRQ2_CLICK (7 << 3) 34#define LIS3_IRQ2_CLICK (7 << 3)
35#define LIS3_IRQ2_MASK (7 << 3)
34#define LIS3_IRQ_OPEN_DRAIN (1 << 6) 36#define LIS3_IRQ_OPEN_DRAIN (1 << 6)
35#define LIS3_IRQ_ACTIVE_LOW (1 << 7) 37#define LIS3_IRQ_ACTIVE_LOW (1 << 7)
36 unsigned char irq_cfg; 38 unsigned char irq_cfg;
@@ -43,6 +45,15 @@ struct lis3lv02d_platform_data {
43#define LIS3_WAKEUP_Z_HI (1 << 5) 45#define LIS3_WAKEUP_Z_HI (1 << 5)
44 unsigned char wakeup_flags; 46 unsigned char wakeup_flags;
45 unsigned char wakeup_thresh; 47 unsigned char wakeup_thresh;
48 unsigned char wakeup_flags2;
49 unsigned char wakeup_thresh2;
50#define LIS3_HIPASS_CUTFF_8HZ 0
51#define LIS3_HIPASS_CUTFF_4HZ 1
52#define LIS3_HIPASS_CUTFF_2HZ 2
53#define LIS3_HIPASS_CUTFF_1HZ 3
54#define LIS3_HIPASS1_DISABLE (1 << 2)
55#define LIS3_HIPASS2_DISABLE (1 << 3)
56 unsigned char hipass_ctrl;
46#define LIS3_NO_MAP 0 57#define LIS3_NO_MAP 0
47#define LIS3_DEV_X 1 58#define LIS3_DEV_X 1
48#define LIS3_DEV_Y 2 59#define LIS3_DEV_Y 2
@@ -58,6 +69,7 @@ struct lis3lv02d_platform_data {
58 /* Limits for selftest are specified in chip data sheet */ 69 /* Limits for selftest are specified in chip data sheet */
59 s16 st_min_limits[3]; /* min pass limit x, y, z */ 70 s16 st_min_limits[3]; /* min pass limit x, y, z */
60 s16 st_max_limits[3]; /* max pass limit x, y, z */ 71 s16 st_max_limits[3]; /* max pass limit x, y, z */
72 int irq2;
61}; 73};
62 74
63#endif /* __LIS3LV02D_H_ */ 75#endif /* __LIS3LV02D_H_ */
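A board file fills the new fields alongside the existing ones. A hedged sketch follows; the device name and IRQ routing are made up, but every constant and struct member appears in the header above:

/* hypothetical board support fragment */
static struct lis3lv02d_platform_data example_lis3_pdata = {
	/* IRQ1 signals data-ready, IRQ2 signals clicks, line active low */
	.irq_cfg     = LIS3_IRQ1_DATA_READY | LIS3_IRQ2_CLICK |
		       LIS3_IRQ_ACTIVE_LOW,
	/* both high-pass stages disabled; 2 Hz cutoff if re-enabled */
	.hipass_ctrl = LIS3_HIPASS1_DISABLE | LIS3_HIPASS2_DISABLE |
		       LIS3_HIPASS_CUTFF_2HZ,
	.irq2        = 0,	/* second IRQ line not wired on this board */
};

/* the new masks let the driver pull one routing field back out */
static unsigned char lis3_irq1_source(const struct lis3lv02d_platform_data *p)
{
	return p->irq_cfg & LIS3_IRQ1_MASK;
}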
diff --git a/include/linux/matroxfb.h b/include/linux/matroxfb.h
index 2203121a43e9..8c22a8938642 100644
--- a/include/linux/matroxfb.h
+++ b/include/linux/matroxfb.h
@@ -4,6 +4,7 @@
4#include <asm/ioctl.h> 4#include <asm/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/videodev2.h> 6#include <linux/videodev2.h>
7#include <linux/fb.h>
7 8
8struct matroxioc_output_mode { 9struct matroxioc_output_mode {
9 __u32 output; /* which output */ 10 __u32 output; /* which output */
@@ -37,7 +38,5 @@ enum matroxfb_ctrl_id {
37 MATROXFB_CID_LAST 38 MATROXFB_CID_LAST
38}; 39};
39 40
40#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
41
42#endif 41#endif
43 42
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44301c6affa8..05894795fdc1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,13 @@ struct page_cgroup;
25struct page; 25struct page;
26struct mm_struct; 26struct mm_struct;
27 27
28extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
29 struct list_head *dst,
30 unsigned long *scanned, int order,
31 int mode, struct zone *z,
32 struct mem_cgroup *mem_cont,
33 int active, int file);
34
28#ifdef CONFIG_CGROUP_MEM_RES_CTLR 35#ifdef CONFIG_CGROUP_MEM_RES_CTLR
29/* 36/*
30 * All "charge" functions with gfp_mask should use GFP_KERNEL or 37 * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -64,12 +71,6 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page);
64extern int mem_cgroup_shmem_charge_fallback(struct page *page, 71extern int mem_cgroup_shmem_charge_fallback(struct page *page,
65 struct mm_struct *mm, gfp_t gfp_mask); 72 struct mm_struct *mm, gfp_t gfp_mask);
66 73
67extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
68 struct list_head *dst,
69 unsigned long *scanned, int order,
70 int mode, struct zone *z,
71 struct mem_cgroup *mem_cont,
72 int active, int file);
73extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); 74extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
74int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); 75int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
75 76
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 35b07b773e6c..864035fb8f8a 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -202,6 +202,7 @@ static inline int is_mem_section_removable(unsigned long pfn,
202} 202}
203#endif /* CONFIG_MEMORY_HOTREMOVE */ 203#endif /* CONFIG_MEMORY_HOTREMOVE */
204 204
205extern int mem_online_node(int nid);
205extern int add_memory(int nid, u64 start, u64 size); 206extern int add_memory(int nid, u64 start, u64 size);
206extern int arch_add_memory(int nid, u64 start, u64 size); 207extern int arch_add_memory(int nid, u64 start, u64 size);
207extern int remove_memory(u64 start, u64 size); 208extern int remove_memory(u64 start, u64 size);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1cc966cd3e5f..7b9ef6bf45aa 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -23,6 +23,13 @@ enum {
23 MPOL_MAX, /* always last member of enum */ 23 MPOL_MAX, /* always last member of enum */
24}; 24};
25 25
26enum mpol_rebind_step {
27 MPOL_REBIND_ONCE, /* do the rebind work in a single pass (not two steps) */
28 MPOL_REBIND_STEP1, /* first step: set all the newly allowed nodes */
29 MPOL_REBIND_STEP2, /* second step: clear the newly disallowed nodes */
30 MPOL_REBIND_NSTEP,
31};
32
26/* Flags for set_mempolicy */ 33/* Flags for set_mempolicy */
27#define MPOL_F_STATIC_NODES (1 << 15) 34#define MPOL_F_STATIC_NODES (1 << 15)
28#define MPOL_F_RELATIVE_NODES (1 << 14) 35#define MPOL_F_RELATIVE_NODES (1 << 14)
@@ -51,6 +58,7 @@ enum {
51 */ 58 */
52#define MPOL_F_SHARED (1 << 0) /* identify shared policies */ 59#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
53#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ 60#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
61#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
54 62
55#ifdef __KERNEL__ 63#ifdef __KERNEL__
56 64
@@ -193,8 +201,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
193 201
194extern void numa_default_policy(void); 202extern void numa_default_policy(void);
195extern void numa_policy_init(void); 203extern void numa_policy_init(void);
196extern void mpol_rebind_task(struct task_struct *tsk, 204extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
197 const nodemask_t *new); 205 enum mpol_rebind_step step);
198extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); 206extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
199extern void mpol_fix_fork_child_flag(struct task_struct *p); 207extern void mpol_fix_fork_child_flag(struct task_struct *p);
200 208
@@ -308,7 +316,8 @@ static inline void numa_default_policy(void)
308} 316}
309 317
310static inline void mpol_rebind_task(struct task_struct *tsk, 318static inline void mpol_rebind_task(struct task_struct *tsk,
311 const nodemask_t *new) 319 const nodemask_t *new,
320 enum mpol_rebind_step step)
312{ 321{
313} 322}
314 323
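The MPOL_REBIND_STEP1/STEP2 pair exists so a rebind never passes through an empty nodemask: step 1 adds the new nodes (a union with the old ones), step 2 removes the now-disallowed ones. A toy userspace model, with a plain bitmask standing in for nodemask_t:

#include <stdio.h>

typedef unsigned long nodemask_t;	/* stand-in for the kernel type */

static nodemask_t task_mems;

static void rebind_step1(nodemask_t new) { task_mems |= new; }	/* add new  */
static void rebind_step2(nodemask_t new) { task_mems = new;  }	/* drop old */

int main(void)
{
	nodemask_t target = 0xcUL;	/* nodes 2,3 */

	task_mems = 0x3UL;		/* nodes 0,1: disjoint from target */
	rebind_step1(target);		/* 0xf: a reader always sees a node */
	/* a concurrent allocator sampling task_mems here still succeeds */
	rebind_step2(target);		/* 0xc */
	printf("%#lx\n", task_mems);
	return 0;
}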
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7f085c97c799..7238231b8dd4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -9,7 +9,7 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
9#ifdef CONFIG_MIGRATION 9#ifdef CONFIG_MIGRATION
10#define PAGE_MIGRATION 1 10#define PAGE_MIGRATION 1
11 11
12extern int putback_lru_pages(struct list_head *l); 12extern void putback_lru_pages(struct list_head *l);
13extern int migrate_page(struct address_space *, 13extern int migrate_page(struct address_space *,
14 struct page *, struct page *); 14 struct page *, struct page *);
15extern int migrate_pages(struct list_head *l, new_page_t x, 15extern int migrate_pages(struct list_head *l, new_page_t x,
@@ -19,17 +19,19 @@ extern int fail_migrate_page(struct address_space *,
19 struct page *, struct page *); 19 struct page *, struct page *);
20 20
21extern int migrate_prep(void); 21extern int migrate_prep(void);
22extern int migrate_prep_local(void);
22extern int migrate_vmas(struct mm_struct *mm, 23extern int migrate_vmas(struct mm_struct *mm,
23 const nodemask_t *from, const nodemask_t *to, 24 const nodemask_t *from, const nodemask_t *to,
24 unsigned long flags); 25 unsigned long flags);
25#else 26#else
26#define PAGE_MIGRATION 0 27#define PAGE_MIGRATION 0
27 28
28static inline int putback_lru_pages(struct list_head *l) { return 0; } 29static inline void putback_lru_pages(struct list_head *l) {}
29static inline int migrate_pages(struct list_head *l, new_page_t x, 30static inline int migrate_pages(struct list_head *l, new_page_t x,
30 unsigned long private, int offlining) { return -ENOSYS; } 31 unsigned long private, int offlining) { return -ENOSYS; }
31 32
32static inline int migrate_prep(void) { return -ENOSYS; } 33static inline int migrate_prep(void) { return -ENOSYS; }
34static inline int migrate_prep_local(void) { return -ENOSYS; }
33 35
34static inline int migrate_vmas(struct mm_struct *mm, 36static inline int migrate_vmas(struct mm_struct *mm,
35 const nodemask_t *from, const nodemask_t *to, 37 const nodemask_t *from, const nodemask_t *to,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fb19bb92b809..b969efb03787 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,6 +13,7 @@
13#include <linux/debug_locks.h> 13#include <linux/debug_locks.h>
14#include <linux/mm_types.h> 14#include <linux/mm_types.h>
15#include <linux/range.h> 15#include <linux/range.h>
16#include <linux/pfn.h>
16 17
17struct mempolicy; 18struct mempolicy;
18struct anon_vma; 19struct anon_vma;
@@ -106,6 +107,9 @@ extern unsigned int kobjsize(const void *objp);
106#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ 107#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
107#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ 108#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
108 109
110/* Bits set in the VMA until the stack is in its final location */
111#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
112
109#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ 113#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
110#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS 114#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
111#endif 115#endif
@@ -334,6 +338,7 @@ void put_page(struct page *page);
334void put_pages_list(struct list_head *pages); 338void put_pages_list(struct list_head *pages);
335 339
336void split_page(struct page *page, unsigned int order); 340void split_page(struct page *page, unsigned int order);
341int split_free_page(struct page *page);
337 342
338/* 343/*
339 * Compound pages have a destructor function. Provide a 344 * Compound pages have a destructor function. Provide a
@@ -591,7 +596,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
591 596
592static __always_inline void *lowmem_page_address(struct page *page) 597static __always_inline void *lowmem_page_address(struct page *page)
593{ 598{
594 return __va(page_to_pfn(page) << PAGE_SHIFT); 599 return __va(PFN_PHYS(page_to_pfn(page)));
595} 600}
596 601
597#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) 602#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
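The switch to PFN_PHYS() in lowmem_page_address() matters where phys_addr_t is wider than unsigned long (32-bit with PAE): `page_to_pfn(page) << PAGE_SHIFT` is evaluated in the narrow type and truncates for pages above 4 GiB. A userspace demonstration of the difference, with PFN_PHYS modeled on linux/pfn.h:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
typedef uint64_t phys_addr_t;			/* PAE-style wide phys addr */
#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT)

int main(void)
{
	uint32_t pfn = 0x110000;		/* a frame above 4 GiB */

	/* shift done in 32 bits: the top bits are silently lost */
	printf("truncated: %#llx\n",
	       (unsigned long long)(pfn << PAGE_SHIFT));
	/* widened before shifting: the full address survives */
	printf("correct:   %#llx\n", (unsigned long long)PFN_PHYS(pfn));
	return 0;
}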
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cf9e458e96b0..0fa491326c4a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -321,6 +321,15 @@ struct zone {
321 unsigned long *pageblock_flags; 321 unsigned long *pageblock_flags;
322#endif /* CONFIG_SPARSEMEM */ 322#endif /* CONFIG_SPARSEMEM */
323 323
324#ifdef CONFIG_COMPACTION
325 /*
326 * On compaction failure, 1<<compact_defer_shift compactions
327 * are skipped before trying again. The number attempted since
328 * last failure is tracked with compact_considered.
329 */
330 unsigned int compact_considered;
331 unsigned int compact_defer_shift;
332#endif
324 333
325 ZONE_PADDING(_pad1_) 334 ZONE_PADDING(_pad1_)
326 335
@@ -641,9 +650,10 @@ typedef struct pglist_data {
641 650
642#include <linux/memory_hotplug.h> 651#include <linux/memory_hotplug.h>
643 652
653extern struct mutex zonelists_mutex;
644void get_zone_counts(unsigned long *active, unsigned long *inactive, 654void get_zone_counts(unsigned long *active, unsigned long *inactive,
645 unsigned long *free); 655 unsigned long *free);
646void build_all_zonelists(void); 656void build_all_zonelists(void *data);
647void wakeup_kswapd(struct zone *zone, int order); 657void wakeup_kswapd(struct zone *zone, int order);
648int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 658int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
649 int classzone_idx, int alloc_flags); 659 int classzone_idx, int alloc_flags);
@@ -972,7 +982,7 @@ struct mem_section {
972#endif 982#endif
973 983
974#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) 984#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
975#define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) 985#define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
976#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) 986#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
977 987
978#ifdef CONFIG_SPARSEMEM_EXTREME 988#ifdef CONFIG_SPARSEMEM_EXTREME
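The two counters added to struct zone implement exponential backoff for compaction: after a failure, 1 << compact_defer_shift subsequent requests are skipped. A hedged model of the helpers the compaction series builds on these fields; the helper names follow include/linux/compaction.h, and the cap value is an assumption:

#include <stdbool.h>

struct zone_model {				/* stand-in for struct zone */
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
};

#define COMPACT_MAX_DEFER_SHIFT 6		/* assumed backoff ceiling */

/* on failure: reset the attempt count and double the skip window */
static void defer_compaction(struct zone_model *z)
{
	z->compact_considered = 0;
	if (++z->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* before trying: skip until 1 << shift requests have been seen */
static bool compaction_deferred(struct zone_model *z)
{
	unsigned int limit = 1U << z->compact_defer_shift;

	if (++z->compact_considered > limit)
		z->compact_considered = limit;	/* avoid counter overflow */
	return z->compact_considered < limit;
}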
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 668cf1bef030..8f69d09a41a5 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -2,7 +2,7 @@
2#define _LINUX_RATELIMIT_H 2#define _LINUX_RATELIMIT_H
3 3
4#include <linux/param.h> 4#include <linux/param.h>
5#include <linux/spinlock_types.h> 5#include <linux/spinlock.h>
6 6
7#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) 7#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ)
8#define DEFAULT_RATELIMIT_BURST 10 8#define DEFAULT_RATELIMIT_BURST 10
@@ -25,6 +25,17 @@ struct ratelimit_state {
25 .burst = burst_init, \ 25 .burst = burst_init, \
26 } 26 }
27 27
28static inline void ratelimit_state_init(struct ratelimit_state *rs,
29 int interval, int burst)
30{
31 spin_lock_init(&rs->lock);
32 rs->interval = interval;
33 rs->burst = burst;
34 rs->printed = 0;
35 rs->missed = 0;
36 rs->begin = 0;
37}
38
28extern int ___ratelimit(struct ratelimit_state *rs, const char *func); 39extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
29#define __ratelimit(state) ___ratelimit(state, __func__) 40#define __ratelimit(state) ___ratelimit(state, __func__)
30 41
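DEFINE_RATELIMIT_STATE only works for static storage; the new ratelimit_state_init() covers states embedded in runtime-allocated objects. A hedged usage sketch (struct mydev is hypothetical; the ratelimit calls are the ones declared above):

struct mydev {				/* hypothetical driver context */
	struct ratelimit_state rs;
};

static void mydev_init(struct mydev *d)
{
	ratelimit_state_init(&d->rs, DEFAULT_RATELIMIT_INTERVAL,
			     DEFAULT_RATELIMIT_BURST);
}

static void mydev_report_error(struct mydev *d)
{
	/* at most DEFAULT_RATELIMIT_BURST messages per interval */
	if (__ratelimit(&d->rs))
		printk(KERN_WARNING "mydev: transfer error\n");
}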
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d25bd224d370..77216742c178 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -26,8 +26,17 @@
26 */ 26 */
27struct anon_vma { 27struct anon_vma {
28 spinlock_t lock; /* Serialize access to vma list */ 28 spinlock_t lock; /* Serialize access to vma list */
29#ifdef CONFIG_KSM 29#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
30 atomic_t ksm_refcount; 30
31 /*
32 * The external_refcount is taken by either KSM or page migration
33 * to pin an anon_vma when there is no guarantee that the vma
34 * or its page tables will exist for the duration of the
35 * operation. A caller that takes the reference is responsible
36 * for cleaning up the anon_vma if it is the last user on
37 * release.
38 */
39 atomic_t external_refcount;
31#endif 40#endif
32 /* 41 /*
33 * NOTE: the LSB of the head.next is set by 42 * NOTE: the LSB of the head.next is set by
@@ -61,22 +70,22 @@ struct anon_vma_chain {
61}; 70};
62 71
63#ifdef CONFIG_MMU 72#ifdef CONFIG_MMU
64#ifdef CONFIG_KSM 73#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
65static inline void ksm_refcount_init(struct anon_vma *anon_vma) 74static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma)
66{ 75{
67 atomic_set(&anon_vma->ksm_refcount, 0); 76 atomic_set(&anon_vma->external_refcount, 0);
68} 77}
69 78
70static inline int ksm_refcount(struct anon_vma *anon_vma) 79static inline int anonvma_external_refcount(struct anon_vma *anon_vma)
71{ 80{
72 return atomic_read(&anon_vma->ksm_refcount); 81 return atomic_read(&anon_vma->external_refcount);
73} 82}
74#else 83#else
75static inline void ksm_refcount_init(struct anon_vma *anon_vma) 84static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma)
76{ 85{
77} 86}
78 87
79static inline int ksm_refcount(struct anon_vma *anon_vma) 88static inline int anonvma_external_refcount(struct anon_vma *anon_vma)
80{ 89{
81 return 0; 90 return 0;
82} 91}
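The comment above states the contract; roughly, the migration side of this series pins the anon_vma before unmapping and drops it afterwards. A sketch of that take/release pattern, assumed to mirror the mm/migrate.c hunk of the series (not shown here):

/* sketch: pin an anon page's anon_vma across migration */
struct anon_vma *anon_vma = NULL;

if (PageAnon(page)) {
	rcu_read_lock();		/* keeps the anon_vma memory valid */
	anon_vma = page_anon_vma(page);
	atomic_inc(&anon_vma->external_refcount);
}

/* ... unmap and copy the page; the anon_vma cannot vanish here ... */

if (anon_vma &&
    atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
	int empty = list_empty(&anon_vma->head);

	spin_unlock(&anon_vma->lock);
	if (empty)
		anon_vma_free(anon_vma);	/* last user cleans up */
}
if (PageAnon(page))
	rcu_read_unlock();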
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b55e988988b5..c0151ffd3541 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -384,7 +384,7 @@ struct user_namespace;
384 * 1-3 now and depends on arch. We use "5" as safe margin, here. 384 * 1-3 now and depends on arch. We use "5" as safe margin, here.
385 */ 385 */
386#define MAPCOUNT_ELF_CORE_MARGIN (5) 386#define MAPCOUNT_ELF_CORE_MARGIN (5)
387#define DEFAULT_MAX_MAP_COUNT (USHORT_MAX - MAPCOUNT_ELF_CORE_MARGIN) 387#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
388 388
389extern int sysctl_max_map_count; 389extern int sysctl_max_map_count;
390 390
@@ -1421,6 +1421,7 @@ struct task_struct {
1421#endif 1421#endif
1422#ifdef CONFIG_CPUSETS 1422#ifdef CONFIG_CPUSETS
1423 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1423 nodemask_t mems_allowed; /* Protected by alloc_lock */
1424 int mems_allowed_change_disable;
1424 int cpuset_mem_spread_rotor; 1425 int cpuset_mem_spread_rotor;
1425#endif 1426#endif
1426#ifdef CONFIG_CGROUPS 1427#ifdef CONFIG_CGROUPS
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ec2b7a42b45f..b6b614364dd8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -152,6 +152,7 @@ enum {
152}; 152};
153 153
154#define SWAP_CLUSTER_MAX 32 154#define SWAP_CLUSTER_MAX 32
155#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
155 156
156#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ 157#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
157#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ 158#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
@@ -224,20 +225,15 @@ static inline void lru_cache_add_anon(struct page *page)
224 __lru_cache_add(page, LRU_INACTIVE_ANON); 225 __lru_cache_add(page, LRU_INACTIVE_ANON);
225} 226}
226 227
227static inline void lru_cache_add_active_anon(struct page *page)
228{
229 __lru_cache_add(page, LRU_ACTIVE_ANON);
230}
231
232static inline void lru_cache_add_file(struct page *page) 228static inline void lru_cache_add_file(struct page *page)
233{ 229{
234 __lru_cache_add(page, LRU_INACTIVE_FILE); 230 __lru_cache_add(page, LRU_INACTIVE_FILE);
235} 231}
236 232
237static inline void lru_cache_add_active_file(struct page *page) 233/* LRU Isolation modes. */
238{ 234#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
239 __lru_cache_add(page, LRU_ACTIVE_FILE); 235#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
240} 236#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
241 237
242/* linux/mm/vmscan.c */ 238/* linux/mm/vmscan.c */
243extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 239extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 117f0dd8ad03..7f43ccdc1d38 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -43,6 +43,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
43 KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, 43 KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
44 KSWAPD_SKIP_CONGESTION_WAIT, 44 KSWAPD_SKIP_CONGESTION_WAIT,
45 PAGEOUTRUN, ALLOCSTALL, PGROTATED, 45 PAGEOUTRUN, ALLOCSTALL, PGROTATED,
46#ifdef CONFIG_COMPACTION
47 COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
48 COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
49#endif
46#ifdef CONFIG_HUGETLB_PAGE 50#ifdef CONFIG_HUGETLB_PAGE
47 HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, 51 HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
48#endif 52#endif
diff --git a/include/net/ip.h b/include/net/ip.h
index 63548f0a44b1..452f229c380a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -358,11 +358,11 @@ enum ip_defrag_users {
358 IP_DEFRAG_LOCAL_DELIVER, 358 IP_DEFRAG_LOCAL_DELIVER,
359 IP_DEFRAG_CALL_RA_CHAIN, 359 IP_DEFRAG_CALL_RA_CHAIN,
360 IP_DEFRAG_CONNTRACK_IN, 360 IP_DEFRAG_CONNTRACK_IN,
361 __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHORT_MAX, 361 __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
362 IP_DEFRAG_CONNTRACK_OUT, 362 IP_DEFRAG_CONNTRACK_OUT,
363 __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHORT_MAX, 363 __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
364 IP_DEFRAG_CONNTRACK_BRIDGE_IN, 364 IP_DEFRAG_CONNTRACK_BRIDGE_IN,
365 __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHORT_MAX, 365 __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
366 IP_DEFRAG_VS_IN, 366 IP_DEFRAG_VS_IN,
367 IP_DEFRAG_VS_OUT, 367 IP_DEFRAG_VS_OUT,
368 IP_DEFRAG_VS_FWD 368 IP_DEFRAG_VS_FWD
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index eba5cc00325a..2600b69757b8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -354,11 +354,11 @@ struct inet_frag_queue;
354enum ip6_defrag_users { 354enum ip6_defrag_users {
355 IP6_DEFRAG_LOCAL_DELIVER, 355 IP6_DEFRAG_LOCAL_DELIVER,
356 IP6_DEFRAG_CONNTRACK_IN, 356 IP6_DEFRAG_CONNTRACK_IN,
357 __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHORT_MAX, 357 __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX,
358 IP6_DEFRAG_CONNTRACK_OUT, 358 IP6_DEFRAG_CONNTRACK_OUT,
359 __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHORT_MAX, 359 __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
360 IP6_DEFRAG_CONNTRACK_BRIDGE_IN, 360 IP6_DEFRAG_CONNTRACK_BRIDGE_IN,
361 __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHORT_MAX, 361 __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
362}; 362};
363 363
364struct ip6_create_arg { 364struct ip6_create_arg {
diff --git a/include/video/da8xx-fb.h b/include/video/da8xx-fb.h
index 89d43b3d4cb9..6316cdabf73f 100644
--- a/include/video/da8xx-fb.h
+++ b/include/video/da8xx-fb.h
@@ -99,6 +99,7 @@ struct lcd_sync_arg {
99#define FBIPUT_COLOR _IOW('F', 6, int) 99#define FBIPUT_COLOR _IOW('F', 6, int)
100#define FBIPUT_HSYNC _IOW('F', 9, int) 100#define FBIPUT_HSYNC _IOW('F', 9, int)
101#define FBIPUT_VSYNC _IOW('F', 10, int) 101#define FBIPUT_VSYNC _IOW('F', 10, int)
102#define FBIO_WAITFORVSYNC _IOW('F', 0x20, u_int32_t)
102 103
103#endif /* ifndef DA8XX_FB_H */ 104#endif /* ifndef DA8XX_FB_H */
104 105
diff --git a/include/video/sh_mobile_lcdc.h b/include/video/sh_mobile_lcdc.h
index 2cc893fc1f85..288205457713 100644
--- a/include/video/sh_mobile_lcdc.h
+++ b/include/video/sh_mobile_lcdc.h
@@ -34,8 +34,6 @@ enum { LCDC_CLK_BUS, LCDC_CLK_PERIPHERAL, LCDC_CLK_EXTERNAL };
34#define LCDC_FLAGS_HSCNT (1 << 3) /* Disable HSYNC during VBLANK */ 34#define LCDC_FLAGS_HSCNT (1 << 3) /* Disable HSYNC during VBLANK */
35#define LCDC_FLAGS_DWCNT (1 << 4) /* Disable dotclock during blanking */ 35#define LCDC_FLAGS_DWCNT (1 << 4) /* Disable dotclock during blanking */
36 36
37#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32)
38
39struct sh_mobile_lcdc_sys_bus_cfg { 37struct sh_mobile_lcdc_sys_bus_cfg {
40 unsigned long ldmt2r; 38 unsigned long ldmt2r;
41 unsigned long ldmt3r; 39 unsigned long ldmt3r;
diff --git a/init/main.c b/init/main.c
index 22881b5e95e3..3bdb152f412f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -567,7 +567,7 @@ asmlinkage void __init start_kernel(void)
567 setup_per_cpu_areas(); 567 setup_per_cpu_areas();
568 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 568 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
569 569
570 build_all_zonelists(); 570 build_all_zonelists(NULL);
571 page_alloc_init(); 571 page_alloc_init();
572 572
573 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); 573 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
diff --git a/ipc/msg.c b/ipc/msg.c
index 9547cb7ac313..747b65507a91 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -345,19 +345,19 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
345 out.msg_rtime = in->msg_rtime; 345 out.msg_rtime = in->msg_rtime;
346 out.msg_ctime = in->msg_ctime; 346 out.msg_ctime = in->msg_ctime;
347 347
348 if (in->msg_cbytes > USHORT_MAX) 348 if (in->msg_cbytes > USHRT_MAX)
349 out.msg_cbytes = USHORT_MAX; 349 out.msg_cbytes = USHRT_MAX;
350 else 350 else
351 out.msg_cbytes = in->msg_cbytes; 351 out.msg_cbytes = in->msg_cbytes;
352 out.msg_lcbytes = in->msg_cbytes; 352 out.msg_lcbytes = in->msg_cbytes;
353 353
354 if (in->msg_qnum > USHORT_MAX) 354 if (in->msg_qnum > USHRT_MAX)
355 out.msg_qnum = USHORT_MAX; 355 out.msg_qnum = USHRT_MAX;
356 else 356 else
357 out.msg_qnum = in->msg_qnum; 357 out.msg_qnum = in->msg_qnum;
358 358
359 if (in->msg_qbytes > USHORT_MAX) 359 if (in->msg_qbytes > USHRT_MAX)
360 out.msg_qbytes = USHORT_MAX; 360 out.msg_qbytes = USHRT_MAX;
361 else 361 else
362 out.msg_qbytes = in->msg_qbytes; 362 out.msg_qbytes = in->msg_qbytes;
363 out.msg_lqbytes = in->msg_qbytes; 363 out.msg_lqbytes = in->msg_qbytes;
diff --git a/ipc/util.c b/ipc/util.c
index 79ce84e890f7..69a0cc13d966 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -124,8 +124,8 @@ void ipc_init_ids(struct ipc_ids *ids)
124 ids->seq = 0; 124 ids->seq = 0;
125 { 125 {
126 int seq_limit = INT_MAX/SEQ_MULTIPLIER; 126 int seq_limit = INT_MAX/SEQ_MULTIPLIER;
127 if (seq_limit > USHORT_MAX) 127 if (seq_limit > USHRT_MAX)
128 ids->seq_max = USHORT_MAX; 128 ids->seq_max = USHRT_MAX;
129 else 129 else
130 ids->seq_max = seq_limit; 130 ids->seq_max = seq_limit;
131 } 131 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 545777574779..124ad9d6be16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -326,6 +326,12 @@ out_notify:
326int __cpuinit cpu_up(unsigned int cpu) 326int __cpuinit cpu_up(unsigned int cpu)
327{ 327{
328 int err = 0; 328 int err = 0;
329
330#ifdef CONFIG_MEMORY_HOTPLUG
331 int nid;
332 pg_data_t *pgdat;
333#endif
334
329 if (!cpu_possible(cpu)) { 335 if (!cpu_possible(cpu)) {
330 printk(KERN_ERR "can't online cpu %d because it is not " 336 printk(KERN_ERR "can't online cpu %d because it is not "
331 "configured as may-hotadd at boot time\n", cpu); 337 "configured as may-hotadd at boot time\n", cpu);
@@ -336,6 +342,28 @@ int __cpuinit cpu_up(unsigned int cpu)
336 return -EINVAL; 342 return -EINVAL;
337 } 343 }
338 344
345#ifdef CONFIG_MEMORY_HOTPLUG
346 nid = cpu_to_node(cpu);
347 if (!node_online(nid)) {
348 err = mem_online_node(nid);
349 if (err)
350 return err;
351 }
352
353 pgdat = NODE_DATA(nid);
354 if (!pgdat) {
355 printk(KERN_ERR
356 "Can't online cpu %d due to NULL pgdat\n", cpu);
357 return -ENOMEM;
358 }
359
360 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
361 mutex_lock(&zonelists_mutex);
362 build_all_zonelists(NULL);
363 mutex_unlock(&zonelists_mutex);
364 }
365#endif
366
339 cpu_maps_update_begin(); 367 cpu_maps_update_begin();
340 368
341 if (cpu_hotplug_disabled) { 369 if (cpu_hotplug_disabled) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a50c5f6e727..61d6af7fa676 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
946 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 946 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
947 * we structure updates as setting all new allowed nodes, then clearing newly 947 * we structure updates as setting all new allowed nodes, then clearing newly
948 * disallowed ones. 948 * disallowed ones.
949 *
950 * Called with task's alloc_lock held
951 */ 949 */
952static void cpuset_change_task_nodemask(struct task_struct *tsk, 950static void cpuset_change_task_nodemask(struct task_struct *tsk,
953 nodemask_t *newmems) 951 nodemask_t *newmems)
954{ 952{
953repeat:
954 /*
955 * Allow tasks that have access to memory reserves because they have
956 * been OOM killed to get memory anywhere.
957 */
958 if (unlikely(test_thread_flag(TIF_MEMDIE)))
959 return;
960 if (current->flags & PF_EXITING) /* Let dying task have memory */
961 return;
962
963 task_lock(tsk);
955 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 964 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
956 mpol_rebind_task(tsk, &tsk->mems_allowed); 965 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
957 mpol_rebind_task(tsk, newmems); 966
967
968 /*
969 * ensure that ->mems_allowed_change_disable is checked after all the
970 * new allowed nodes have been set.
971 *
972 * the read-side task can then see a nodemask containing both the new
973 * and the old allowed nodes, so if it allocates a page while the
974 * cpuset clears the newly disallowed ones, it still sees new bits.
975 *
976 * if the new nodes were instead set after this check, setting them
977 * and clearing the newly disallowed ones could complete back to
978 * back, and the read-side task might find no node to allocate from.
979 */
980 smp_mb();
981
982 /*
983 * Allocation of memory is very fast, so we need not sleep while
984 * waiting for the read side.
985 */
986 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
987 task_unlock(tsk);
988 if (!task_curr(tsk))
989 yield();
990 goto repeat;
991 }
992
993 /*
994 * ensure checking ->mems_allowed_change_disable before clearing all new
995 * disallowed nodes.
996 *
997 * if clearing newly disallowed bits before the checking, the read-side
998 * task may find no node to alloc page.
999 */
1000 smp_mb();
1001
1002 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
958 tsk->mems_allowed = *newmems; 1003 tsk->mems_allowed = *newmems;
1004 task_unlock(tsk);
959} 1005}
960 1006
961/* 1007/*
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
978 cs = cgroup_cs(scan->cg); 1024 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, newmems); 1025 guarantee_online_mems(cs, newmems);
980 1026
981 task_lock(p);
982 cpuset_change_task_nodemask(p, newmems); 1027 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p);
984 1028
985 NODEMASK_FREE(newmems); 1029 NODEMASK_FREE(newmems);
986 1030
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1383 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1427 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1384 WARN_ON_ONCE(err); 1428 WARN_ON_ONCE(err);
1385 1429
1386 task_lock(tsk);
1387 cpuset_change_task_nodemask(tsk, to); 1430 cpuset_change_task_nodemask(tsk, to);
1388 task_unlock(tsk);
1389 cpuset_update_task_spread_flag(cs, tsk); 1431 cpuset_update_task_spread_flag(cs, tsk);
1390 1432
1391} 1433}
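The writer above spins on mems_allowed_change_disable; readers bump that counter around their use of mems_allowed so the writer cannot clear old bits out from under them. A hedged sketch of the matching read side, assumed to follow the get_mems_allowed()/put_mems_allowed() helpers this series adds to include/linux/cpuset.h:

static inline void get_mems_allowed(void)
{
	current->mems_allowed_change_disable++;
	/*
	 * pairs with the writer's smp_mb(): the writer must observe the
	 * raised counter before this task samples mems_allowed
	 */
	smp_mb();
}

static inline void put_mems_allowed(void)
{
	/* finish reading mems_allowed before dropping the counter */
	smp_mb();
	--ACCESS_ONCE(current->mems_allowed_change_disable);
}

/*
 * allocator usage:
 *	get_mems_allowed();
 *	... walk zonelists restricted by current->mems_allowed ...
 *	put_mems_allowed();
 */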
diff --git a/kernel/exit.c b/kernel/exit.c
index eabca5a73a85..019a2843bf95 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code)
1002 1002
1003 exit_notify(tsk, group_dead); 1003 exit_notify(tsk, group_dead);
1004#ifdef CONFIG_NUMA 1004#ifdef CONFIG_NUMA
1005 task_lock(tsk);
1005 mpol_put(tsk->mempolicy); 1006 mpol_put(tsk->mempolicy);
1006 tsk->mempolicy = NULL; 1007 tsk->mempolicy = NULL;
1008 task_unlock(tsk);
1007#endif 1009#endif
1008#ifdef CONFIG_FUTEX 1010#ifdef CONFIG_FUTEX
1009 if (unlikely(current->pi_state_cache)) 1011 if (unlikely(current->pi_state_cache))
diff --git a/kernel/module.c b/kernel/module.c
index a8014bfb5a4e..625985e70e9d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -180,8 +180,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
180extern const struct kernel_symbol __stop___ksymtab_gpl[]; 180extern const struct kernel_symbol __stop___ksymtab_gpl[];
181extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
182extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
183extern const struct kernel_symbol __start___ksymtab_gpl_future[];
184extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
185extern const unsigned long __start___kcrctab[]; 183extern const unsigned long __start___kcrctab[];
186extern const unsigned long __start___kcrctab_gpl[]; 184extern const unsigned long __start___kcrctab_gpl[];
187extern const unsigned long __start___kcrctab_gpl_future[]; 185extern const unsigned long __start___kcrctab_gpl_future[];
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4c93486b45d1..84ff5e75c084 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,6 +37,7 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
40#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
41#include <linux/initrd.h> 42#include <linux/initrd.h>
42#include <linux/key.h> 43#include <linux/key.h>
@@ -262,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
262static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
263#endif 264#endif
264 265
266#ifdef CONFIG_COMPACTION
267static int min_extfrag_threshold;
268static int max_extfrag_threshold = 1000;
269#endif
270
265static struct ctl_table kern_table[] = { 271static struct ctl_table kern_table[] = {
266 { 272 {
267 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
@@ -1121,6 +1127,25 @@ static struct ctl_table vm_table[] = {
1121 .mode = 0644, 1127 .mode = 0644,
1122 .proc_handler = drop_caches_sysctl_handler, 1128 .proc_handler = drop_caches_sysctl_handler,
1123 }, 1129 },
1130#ifdef CONFIG_COMPACTION
1131 {
1132 .procname = "compact_memory",
1133 .data = &sysctl_compact_memory,
1134 .maxlen = sizeof(int),
1135 .mode = 0200,
1136 .proc_handler = sysctl_compaction_handler,
1137 },
1138 {
1139 .procname = "extfrag_threshold",
1140 .data = &sysctl_extfrag_threshold,
1141 .maxlen = sizeof(int),
1142 .mode = 0644,
1143 .proc_handler = sysctl_extfrag_handler,
1144 .extra1 = &min_extfrag_threshold,
1145 .extra2 = &max_extfrag_threshold,
1146 },
1147
1148#endif /* CONFIG_COMPACTION */
1124 { 1149 {
1125 .procname = "min_free_kbytes", 1150 .procname = "min_free_kbytes",
1126 .data = &min_free_kbytes, 1151 .data = &min_free_kbytes,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 937d31dc8566..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17 18
18#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
@@ -1124,11 +1125,6 @@ out:
1124 return result; 1125 return result;
1125} 1126}
1126 1127
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{ 1130{
@@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1156 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out; 1153 goto out;
1158 1154
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1160 str += 2; 1157 str += 2;
1161 if (*str == '-') 1158 if (*str == '-')
1162 str++; 1159 str++;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d85be90d5888..231208948363 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1039,10 +1039,10 @@ config DYNAMIC_DEBUG
1039 1039
1040 Usage: 1040 Usage:
1041 1041
1042 Dynamic debugging is controlled via the 'dynamic_debug/ddebug' file, 1042 Dynamic debugging is controlled via the 'dynamic_debug/control' file,
1043 which is contained in the 'debugfs' filesystem. Thus, the debugfs 1043 which is contained in the 'debugfs' filesystem. Thus, the debugfs
1044 filesystem must first be mounted before making use of this feature. 1044 filesystem must first be mounted before making use of this feature.
1045 We refer to the control file as: <debugfs>/dynamic_debug/ddebug. This 1045 We refer to the control file as: <debugfs>/dynamic_debug/control. This
1046 file contains a list of the debug statements that can be enabled. The 1046 file contains a list of the debug statements that can be enabled. The
1047 format for each line of the file is: 1047 format for each line of the file is:
1048 1048
@@ -1057,7 +1057,7 @@ config DYNAMIC_DEBUG
1057 1057
1058 From a live system: 1058 From a live system:
1059 1059
1060 nullarbor:~ # cat <debugfs>/dynamic_debug/ddebug 1060 nullarbor:~ # cat <debugfs>/dynamic_debug/control
1061 # filename:lineno [module]function flags format 1061 # filename:lineno [module]function flags format
1062 fs/aio.c:222 [aio]__put_ioctx - "__put_ioctx:\040freeing\040%p\012" 1062 fs/aio.c:222 [aio]__put_ioctx - "__put_ioctx:\040freeing\040%p\012"
1063 fs/aio.c:248 [aio]ioctx_alloc - "ENOMEM:\040nr_events\040too\040high\012" 1063 fs/aio.c:248 [aio]ioctx_alloc - "ENOMEM:\040nr_events\040too\040high\012"
@@ -1067,23 +1067,23 @@ config DYNAMIC_DEBUG
1067 1067
1068 // enable the message at line 1603 of file svcsock.c 1068 // enable the message at line 1603 of file svcsock.c
1069 nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' > 1069 nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
1070 <debugfs>/dynamic_debug/ddebug 1070 <debugfs>/dynamic_debug/control
1071 1071
1072 // enable all the messages in file svcsock.c 1072 // enable all the messages in file svcsock.c
1073 nullarbor:~ # echo -n 'file svcsock.c +p' > 1073 nullarbor:~ # echo -n 'file svcsock.c +p' >
1074 <debugfs>/dynamic_debug/ddebug 1074 <debugfs>/dynamic_debug/control
1075 1075
1076 // enable all the messages in the NFS server module 1076 // enable all the messages in the NFS server module
1077 nullarbor:~ # echo -n 'module nfsd +p' > 1077 nullarbor:~ # echo -n 'module nfsd +p' >
1078 <debugfs>/dynamic_debug/ddebug 1078 <debugfs>/dynamic_debug/control
1079 1079
1080 // enable all 12 messages in the function svc_process() 1080 // enable all 12 messages in the function svc_process()
1081 nullarbor:~ # echo -n 'func svc_process +p' > 1081 nullarbor:~ # echo -n 'func svc_process +p' >
1082 <debugfs>/dynamic_debug/ddebug 1082 <debugfs>/dynamic_debug/control
1083 1083
1084 // disable all 12 messages in the function svc_process() 1084 // disable all 12 messages in the function svc_process()
1085 nullarbor:~ # echo -n 'func svc_process -p' > 1085 nullarbor:~ # echo -n 'func svc_process -p' >
1086 <debugfs>/dynamic_debug/ddebug 1086 <debugfs>/dynamic_debug/control
1087 1087
1088 See Documentation/dynamic-debug-howto.txt for additional information. 1088 See Documentation/dynamic-debug-howto.txt for additional information.
1089 1089
diff --git a/lib/crc32.c b/lib/crc32.c
index bc5b936e9142..3087ed899ee3 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -48,12 +48,20 @@ MODULE_LICENSE("GPL");
48#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8 48#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8
49 49
50static inline u32 50static inline u32
51crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 *tab) 51crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
52{ 52{
53# ifdef __LITTLE_ENDIAN 53# if __BYTE_ORDER == __LITTLE_ENDIAN
54# define DO_CRC(x) crc = tab[(crc ^ (x)) & 255 ] ^ (crc >> 8) 54# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
55# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
56 tab[2][(crc >> 8) & 255] ^ \
57 tab[1][(crc >> 16) & 255] ^ \
58 tab[0][(crc >> 24) & 255]
55# else 59# else
56# define DO_CRC(x) crc = tab[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) 60# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
61# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
62 tab[1][(crc >> 8) & 255] ^ \
63 tab[2][(crc >> 16) & 255] ^ \
64 tab[3][(crc >> 24) & 255]
57# endif 65# endif
58 const u32 *b; 66 const u32 *b;
59 size_t rem_len; 67 size_t rem_len;
@@ -70,10 +78,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 *tab)
70 b = (const u32 *)buf; 78 b = (const u32 *)buf;
71 for (--b; len; --len) { 79 for (--b; len; --len) {
72 crc ^= *++b; /* use pre increment for speed */ 80 crc ^= *++b; /* use pre increment for speed */
73 DO_CRC(0); 81 DO_CRC4;
74 DO_CRC(0);
75 DO_CRC(0);
76 DO_CRC(0);
77 } 82 }
78 len = rem_len; 83 len = rem_len;
79 /* And the last few bytes */ 84 /* And the last few bytes */
@@ -85,6 +90,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 *tab)
85 } 90 }
86 return crc; 91 return crc;
87#undef DO_CRC 92#undef DO_CRC
93#undef DO_CRC4
88} 94}
89#endif 95#endif
90/** 96/**
@@ -117,7 +123,7 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
117u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) 123u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
118{ 124{
119# if CRC_LE_BITS == 8 125# if CRC_LE_BITS == 8
120 const u32 *tab = crc32table_le; 126 const u32 (*tab)[] = crc32table_le;
121 127
122 crc = __cpu_to_le32(crc); 128 crc = __cpu_to_le32(crc);
123 crc = crc32_body(crc, p, len, tab); 129 crc = crc32_body(crc, p, len, tab);
@@ -174,7 +180,7 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
174u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) 180u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
175{ 181{
176# if CRC_BE_BITS == 8 182# if CRC_BE_BITS == 8
177 const u32 *tab = crc32table_be; 183 const u32 (*tab)[] = crc32table_be;
178 184
179 crc = __cpu_to_be32(crc); 185 crc = __cpu_to_be32(crc);
180 crc = crc32_body(crc, p, len, tab); 186 crc = crc32_body(crc, p, len, tab);
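The DO_CRC4 macro is the "slicing-by-4" technique: one 32-bit XOR plus four table lookups consumes four message bytes per pass instead of one. A self-contained userspace version for the standard IEEE polynomial follows; it is hedged in that it adds the usual ~0 pre/post conditioning, which the kernel's crc32_le() leaves to its callers, while the table construction matches gen_crc32table.c below:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t tab[4][256];

static void crc32_init(void)
{
	uint32_t i, crc;
	int j, k;

	for (i = 0; i < 256; i++) {		/* tab[0]: classic byte table */
		crc = i;
		for (k = 0; k < 8; k++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
		tab[0][i] = crc;
	}
	for (i = 0; i < 256; i++)		/* tab[j]: j extra zero bytes */
		for (j = 1; j < 4; j++)
			tab[j][i] = tab[0][tab[j - 1][i] & 0xff] ^
				    (tab[j - 1][i] >> 8);
}

static uint32_t crc32(const unsigned char *p, size_t len)
{
	uint32_t crc = 0xFFFFFFFFu;

	while (len >= 4) {			/* four bytes per pass: DO_CRC4 */
		crc ^= (uint32_t)p[0] | (uint32_t)p[1] << 8 |
		       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
		crc = tab[3][crc & 0xff] ^ tab[2][(crc >> 8) & 0xff] ^
		      tab[1][(crc >> 16) & 0xff] ^ tab[0][crc >> 24];
		p += 4;
		len -= 4;
	}
	while (len--)				/* the last few bytes: DO_CRC */
		crc = tab[0][(crc ^ *p++) & 0xff] ^ (crc >> 8);
	return ~crc;
}

int main(void)
{
	crc32_init();
	/* well-known check value for "123456789": 0xcbf43926 */
	printf("%#x\n", crc32((const unsigned char *)"123456789", 9));
	return 0;
}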
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index d6b8b9b1abfe..3df8eb17a607 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -456,7 +456,7 @@ static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf,
456 __func__, (int)len); 456 __func__, (int)len);
457 457
458 nwords = ddebug_tokenize(tmpbuf, words, MAXWORDS); 458 nwords = ddebug_tokenize(tmpbuf, words, MAXWORDS);
459 if (nwords < 0) 459 if (nwords <= 0)
460 return -EINVAL; 460 return -EINVAL;
461 if (ddebug_parse_query(words, nwords-1, &query)) 461 if (ddebug_parse_query(words, nwords-1, &query))
462 return -EINVAL; 462 return -EINVAL;
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index bea5d97df991..85d0e412a04f 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -7,8 +7,8 @@
7#define LE_TABLE_SIZE (1 << CRC_LE_BITS) 7#define LE_TABLE_SIZE (1 << CRC_LE_BITS)
8#define BE_TABLE_SIZE (1 << CRC_BE_BITS) 8#define BE_TABLE_SIZE (1 << CRC_BE_BITS)
9 9
10static uint32_t crc32table_le[LE_TABLE_SIZE]; 10static uint32_t crc32table_le[4][LE_TABLE_SIZE];
11static uint32_t crc32table_be[BE_TABLE_SIZE]; 11static uint32_t crc32table_be[4][BE_TABLE_SIZE];
12 12
13/** 13/**
14 * crc32init_le() - allocate and initialize LE table data 14 * crc32init_le() - allocate and initialize LE table data
@@ -22,12 +22,19 @@ static void crc32init_le(void)
22 unsigned i, j; 22 unsigned i, j;
23 uint32_t crc = 1; 23 uint32_t crc = 1;
24 24
25 crc32table_le[0] = 0; 25 crc32table_le[0][0] = 0;
26 26
27 for (i = 1 << (CRC_LE_BITS - 1); i; i >>= 1) { 27 for (i = 1 << (CRC_LE_BITS - 1); i; i >>= 1) {
28 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); 28 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
29 for (j = 0; j < LE_TABLE_SIZE; j += 2 * i) 29 for (j = 0; j < LE_TABLE_SIZE; j += 2 * i)
30 crc32table_le[i + j] = crc ^ crc32table_le[j]; 30 crc32table_le[0][i + j] = crc ^ crc32table_le[0][j];
31 }
32 for (i = 0; i < LE_TABLE_SIZE; i++) {
33 crc = crc32table_le[0][i];
34 for (j = 1; j < 4; j++) {
35 crc = crc32table_le[0][crc & 0xff] ^ (crc >> 8);
36 crc32table_le[j][i] = crc;
37 }
31 } 38 }
32} 39}
33 40
@@ -39,25 +46,35 @@ static void crc32init_be(void)
39 unsigned i, j; 46 unsigned i, j;
40 uint32_t crc = 0x80000000; 47 uint32_t crc = 0x80000000;
41 48
42 crc32table_be[0] = 0; 49 crc32table_be[0][0] = 0;
43 50
44 for (i = 1; i < BE_TABLE_SIZE; i <<= 1) { 51 for (i = 1; i < BE_TABLE_SIZE; i <<= 1) {
45 crc = (crc << 1) ^ ((crc & 0x80000000) ? CRCPOLY_BE : 0); 52 crc = (crc << 1) ^ ((crc & 0x80000000) ? CRCPOLY_BE : 0);
46 for (j = 0; j < i; j++) 53 for (j = 0; j < i; j++)
47 crc32table_be[i + j] = crc ^ crc32table_be[j]; 54 crc32table_be[0][i + j] = crc ^ crc32table_be[0][j];
55 }
56 for (i = 0; i < BE_TABLE_SIZE; i++) {
57 crc = crc32table_be[0][i];
58 for (j = 1; j < 4; j++) {
59 crc = crc32table_be[0][(crc >> 24) & 0xff] ^ (crc << 8);
60 crc32table_be[j][i] = crc;
61 }
48 } 62 }
49} 63}
50 64
51static void output_table(uint32_t table[], int len, char *trans) 65static void output_table(uint32_t table[4][256], int len, char *trans)
52{ 66{
53 int i; 67 int i, j;
54 68
55 for (i = 0; i < len - 1; i++) { 69 for (j = 0 ; j < 4; j++) {
56 if (i % ENTRIES_PER_LINE == 0) 70 printf("{");
57 printf("\n"); 71 for (i = 0; i < len - 1; i++) {
58 printf("%s(0x%8.8xL), ", trans, table[i]); 72 if (i % ENTRIES_PER_LINE == 0)
73 printf("\n");
74 printf("%s(0x%8.8xL), ", trans, table[j][i]);
75 }
76 printf("%s(0x%8.8xL)},\n", trans, table[j][len - 1]);
59 } 77 }
60 printf("%s(0x%8.8xL)\n", trans, table[len - 1]);
61} 78}
62 79
63int main(int argc, char** argv) 80int main(int argc, char** argv)
@@ -66,14 +83,14 @@ int main(int argc, char** argv)
66 83
67 if (CRC_LE_BITS > 1) { 84 if (CRC_LE_BITS > 1) {
68 crc32init_le(); 85 crc32init_le();
69 printf("static const u32 crc32table_le[] = {"); 86 printf("static const u32 crc32table_le[4][256] = {");
70 output_table(crc32table_le, LE_TABLE_SIZE, "tole"); 87 output_table(crc32table_le, LE_TABLE_SIZE, "tole");
71 printf("};\n"); 88 printf("};\n");
72 } 89 }
73 90
74 if (CRC_BE_BITS > 1) { 91 if (CRC_BE_BITS > 1) {
75 crc32init_be(); 92 crc32init_be();
76 printf("static const u32 crc32table_be[] = {"); 93 printf("static const u32 crc32table_be[4][256] = {");
77 output_table(crc32table_be, BE_TABLE_SIZE, "tobe"); 94 output_table(crc32table_be, BE_TABLE_SIZE, "tobe");
78 printf("};\n"); 95 printf("};\n");
79 } 96 }
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 39af2560f765..5d7a4802c562 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -16,6 +16,24 @@ const char hex_asc[] = "0123456789abcdef";
 EXPORT_SYMBOL(hex_asc);
 
 /**
+ * hex_to_bin - convert a hex digit to its real value
+ * @ch: ascii character represents hex digit
+ *
+ * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad
+ * input.
+ */
+int hex_to_bin(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	ch = tolower(ch);
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	return -1;
+}
+EXPORT_SYMBOL(hex_to_bin);
+
+/**
  * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory
  * @buf: data blob to dump
  * @len: number of bytes in the @buf
@@ -34,7 +52,7 @@ EXPORT_SYMBOL(hex_asc);
  *
  * E.g.:
  *   hex_dump_to_buffer(frame->data, frame->len, 16, 1,
- *			linebuf, sizeof(linebuf), 1);
+ *			linebuf, sizeof(linebuf), true);
  *
  * example output buffer:
  * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
@@ -65,8 +83,8 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
 
 		for (j = 0; j < ngroups; j++)
 			lx += scnprintf(linebuf + lx, linebuflen - lx,
-				"%s%16.16llx", j ? " " : "",
-				(unsigned long long)*(ptr8 + j));
+					"%s%16.16llx", j ? " " : "",
+					(unsigned long long)*(ptr8 + j));
 		ascii_column = 17 * ngroups + 2;
 		break;
 	}
@@ -77,7 +95,7 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
 
 		for (j = 0; j < ngroups; j++)
 			lx += scnprintf(linebuf + lx, linebuflen - lx,
-				"%s%8.8x", j ? " " : "", *(ptr4 + j));
+					"%s%8.8x", j ? " " : "", *(ptr4 + j));
 		ascii_column = 9 * ngroups + 2;
 		break;
 	}
@@ -88,7 +106,7 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
 
 		for (j = 0; j < ngroups; j++)
 			lx += scnprintf(linebuf + lx, linebuflen - lx,
-				"%s%4.4x", j ? " " : "", *(ptr2 + j));
+					"%s%4.4x", j ? " " : "", *(ptr2 + j));
 		ascii_column = 5 * ngroups + 2;
 		break;
 	}
@@ -111,9 +129,10 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
 
 	while (lx < (linebuflen - 1) && lx < (ascii_column - 1))
 		linebuf[lx++] = ' ';
-	for (j = 0; (j < len) && (lx + 2) < linebuflen; j++)
-		linebuf[lx++] = (isascii(ptr[j]) && isprint(ptr[j])) ? ptr[j]
-				: '.';
+	for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) {
+		ch = ptr[j];
+		linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.';
+	}
 nil:
 	linebuf[lx++] = '\0';
 }
@@ -143,7 +162,7 @@ EXPORT_SYMBOL(hex_dump_to_buffer);
 *
 * E.g.:
 *   print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS,
-*		    16, 1, frame->data, frame->len, 1);
+*		    16, 1, frame->data, frame->len, true);
 *
 * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode:
 * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
@@ -151,12 +170,12 @@ EXPORT_SYMBOL(hex_dump_to_buffer);
 * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c  pqrstuvwxyz{|}~.
 */
 void print_hex_dump(const char *level, const char *prefix_str, int prefix_type,
 		    int rowsize, int groupsize,
 		    const void *buf, size_t len, bool ascii)
 {
 	const u8 *ptr = buf;
 	int i, linelen, remaining = len;
-	unsigned char linebuf[200];
+	unsigned char linebuf[32 * 3 + 2 + 32 + 1];
 
 	if (rowsize != 16 && rowsize != 32)
 		rowsize = 16;
@@ -164,13 +183,14 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type,
 	for (i = 0; i < len; i += rowsize) {
 		linelen = min(remaining, rowsize);
 		remaining -= rowsize;
+
 		hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
 				   linebuf, sizeof(linebuf), ascii);
 
 		switch (prefix_type) {
 		case DUMP_PREFIX_ADDRESS:
-			printk("%s%s%*p: %s\n", level, prefix_str,
-				(int)(2 * sizeof(void *)), ptr + i, linebuf);
+			printk("%s%s%p: %s\n",
+			       level, prefix_str, ptr + i, linebuf);
 			break;
 		case DUMP_PREFIX_OFFSET:
 			printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf);
@@ -196,9 +216,9 @@ EXPORT_SYMBOL(print_hex_dump);
 * rowsize of 16, groupsize of 1, and ASCII output included.
 */
 void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
 			  const void *buf, size_t len)
 {
 	print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1,
-		       buf, len, 1);
+		       buf, len, true);
 }
 EXPORT_SYMBOL(print_hex_dump_bytes);
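hex_to_bin() above gives callers a one-line hex-digit parser with explicit error reporting. A minimal sketch of the intended call pattern (parse_hex_byte() is illustrative, not part of the patch):

	/* Combine two hex digits into one byte; returns -1 on bad input. */
	static int parse_hex_byte(const char *s)
	{
		int hi = hex_to_bin(s[0]);
		int lo = hex_to_bin(s[1]);

		if ((hi < 0) || (lo < 0))
			return -1;
		return (hi << 4) | lo;
	}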
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 46d34b0b74a8..b8a2f549ab0e 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -267,7 +267,8 @@ int strict_strtoll(const char *cp, unsigned int base, long long *res)
 }
 EXPORT_SYMBOL(strict_strtoll);
 
-static int skip_atoi(const char **s)
+static noinline_for_stack
+int skip_atoi(const char **s)
 {
 	int i = 0;
 
@@ -287,7 +288,8 @@ static int skip_atoi(const char **s)
 /* Formats correctly any integer in [0,99999].
  * Outputs from one to five digits depending on input.
  * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */
-static char *put_dec_trunc(char *buf, unsigned q)
+static noinline_for_stack
+char *put_dec_trunc(char *buf, unsigned q)
 {
 	unsigned d3, d2, d1, d0;
 	d1 = (q>>4) & 0xf;
@@ -324,7 +326,8 @@ static char *put_dec_trunc(char *buf, unsigned q)
 	return buf;
 }
 /* Same with if's removed. Always emits five digits */
-static char *put_dec_full(char *buf, unsigned q)
+static noinline_for_stack
+char *put_dec_full(char *buf, unsigned q)
 {
 	/* BTW, if q is in [0,9999], 8-bit ints will be enough, */
 	/* but anyway, gcc produces better code with full-sized ints */
@@ -366,7 +369,8 @@ static char *put_dec_full(char *buf, unsigned q)
 	return buf;
 }
 /* No inlining helps gcc to use registers better */
-static noinline char *put_dec(char *buf, unsigned long long num)
+static noinline_for_stack
+char *put_dec(char *buf, unsigned long long num)
 {
 	while (1) {
 		unsigned rem;
@@ -417,8 +421,9 @@ struct printf_spec {
 	s16	precision;	/* # of digits/chars */
 };
 
-static char *number(char *buf, char *end, unsigned long long num,
-			struct printf_spec spec)
+static noinline_for_stack
+char *number(char *buf, char *end, unsigned long long num,
+	     struct printf_spec spec)
 {
 	/* we are called with base 8, 10 or 16, only, thus don't need "G..." */
 	static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */
@@ -537,7 +542,8 @@ static char *number(char *buf, char *end, unsigned long long num,
 	return buf;
 }
 
-static char *string(char *buf, char *end, const char *s, struct printf_spec spec)
+static noinline_for_stack
+char *string(char *buf, char *end, const char *s, struct printf_spec spec)
 {
 	int len, i;
 
@@ -567,8 +573,9 @@ static char *string(char *buf, char *end, const char *s, struct printf_spec spec
 	return buf;
 }
 
-static char *symbol_string(char *buf, char *end, void *ptr,
-				struct printf_spec spec, char ext)
+static noinline_for_stack
+char *symbol_string(char *buf, char *end, void *ptr,
+		    struct printf_spec spec, char ext)
 {
 	unsigned long value = (unsigned long) ptr;
 #ifdef CONFIG_KALLSYMS
@@ -588,8 +595,9 @@ static char *symbol_string(char *buf, char *end, void *ptr,
 #endif
 }
 
-static char *resource_string(char *buf, char *end, struct resource *res,
-				struct printf_spec spec, const char *fmt)
+static noinline_for_stack
+char *resource_string(char *buf, char *end, struct resource *res,
+		      struct printf_spec spec, const char *fmt)
 {
 #ifndef IO_RSRC_PRINTK_SIZE
 #define IO_RSRC_PRINTK_SIZE	6
@@ -690,8 +698,9 @@ static char *resource_string(char *buf, char *end, struct resource *res,
 	return string(buf, end, sym, spec);
 }
 
-static char *mac_address_string(char *buf, char *end, u8 *addr,
-				struct printf_spec spec, const char *fmt)
+static noinline_for_stack
+char *mac_address_string(char *buf, char *end, u8 *addr,
+			 struct printf_spec spec, const char *fmt)
 {
 	char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
 	char *p = mac_addr;
@@ -714,7 +723,8 @@ static char *mac_address_string(char *buf, char *end, u8 *addr,
 	return string(buf, end, mac_addr, spec);
 }
 
-static char *ip4_string(char *p, const u8 *addr, const char *fmt)
+static noinline_for_stack
+char *ip4_string(char *p, const u8 *addr, const char *fmt)
 {
 	int i;
 	bool leading_zeros = (fmt[0] == 'i');
@@ -763,7 +773,8 @@ static char *ip4_string(char *p, const u8 *addr, const char *fmt)
 	return p;
 }
 
-static char *ip6_compressed_string(char *p, const char *addr)
+static noinline_for_stack
+char *ip6_compressed_string(char *p, const char *addr)
 {
 	int i, j, range;
 	unsigned char zerolength[8];
@@ -843,7 +854,8 @@ static char *ip6_compressed_string(char *p, const char *addr)
 	return p;
 }
 
-static char *ip6_string(char *p, const char *addr, const char *fmt)
+static noinline_for_stack
+char *ip6_string(char *p, const char *addr, const char *fmt)
 {
 	int i;
 
@@ -858,8 +870,9 @@ static char *ip6_string(char *p, const char *addr, const char *fmt)
 	return p;
 }
 
-static char *ip6_addr_string(char *buf, char *end, const u8 *addr,
-			     struct printf_spec spec, const char *fmt)
+static noinline_for_stack
+char *ip6_addr_string(char *buf, char *end, const u8 *addr,
+		      struct printf_spec spec, const char *fmt)
 {
 	char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];
 
@@ -871,8 +884,9 @@ static char *ip6_addr_string(char *buf, char *end, const u8 *addr,
 	return string(buf, end, ip6_addr, spec);
 }
 
-static char *ip4_addr_string(char *buf, char *end, const u8 *addr,
-			     struct printf_spec spec, const char *fmt)
+static noinline_for_stack
+char *ip4_addr_string(char *buf, char *end, const u8 *addr,
+		      struct printf_spec spec, const char *fmt)
 {
 	char ip4_addr[sizeof("255.255.255.255")];
 
@@ -881,8 +895,9 @@ static char *ip4_addr_string(char *buf, char *end, const u8 *addr,
 	return string(buf, end, ip4_addr, spec);
 }
 
-static char *uuid_string(char *buf, char *end, const u8 *addr,
-			 struct printf_spec spec, const char *fmt)
+static noinline_for_stack
+char *uuid_string(char *buf, char *end, const u8 *addr,
+		  struct printf_spec spec, const char *fmt)
 {
 	char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
 	char *p = uuid;
@@ -970,8 +985,9 @@ static char *uuid_string(char *buf, char *end, const u8 *addr,
 * function pointers are really function descriptors, which contain a
 * pointer to the real address.
 */
-static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
-			struct printf_spec spec)
+static noinline_for_stack
+char *pointer(const char *fmt, char *buf, char *end, void *ptr,
+	      struct printf_spec spec)
 {
 	if (!ptr)
 		return string(buf, end, "(null)", spec);
@@ -1040,7 +1056,8 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 * @precision: precision of a number
 * @qualifier: qualifier of a number (long, size_t, ...)
 */
-static int format_decode(const char *fmt, struct printf_spec *spec)
+static noinline_for_stack
+int format_decode(const char *fmt, struct printf_spec *spec)
 {
 	const char *start = fmt;
 
@@ -1980,7 +1997,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
 		{
 			char *s = (char *)va_arg(args, char *);
 			if (field_width == -1)
-				field_width = SHORT_MAX;
+				field_width = SHRT_MAX;
 			/* first, skip leading white space in buffer */
 			str = skip_spaces(str);
 
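The recurring noinline_for_stack annotation above exists because vsnprintf() sits on deep call chains; if gcc inlined every helper, all of their locals would land in one oversized vsnprintf() frame. The annotation expands to plain noinline in <linux/compiler.h>; a sketch of the effect (the helper below is hypothetical):

	#define noinline_for_stack noinline

	/* Hypothetical helper: its scratch buffer is charged to this
	 * frame only and released on return, instead of inflating the
	 * caller's frame for the caller's entire lifetime. */
	static noinline_for_stack char *put_hex_byte(char *buf, char *end,
						     unsigned char b)
	{
		char tmp[2];	/* stand-in for a worker buffer */

		tmp[0] = "0123456789abcdef"[b >> 4];
		tmp[1] = "0123456789abcdef"[b & 0xf];
		if (buf < end)
			*buf++ = tmp[0];
		if (buf < end)
			*buf++ = tmp[1];
		return buf;
	}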
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c61158308dc..527136b22384 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -172,6 +172,15 @@ config SPLIT_PTLOCK_CPUS
 	default "4"
 
 #
+# support for memory compaction
+config COMPACTION
+	bool "Allow for memory compaction"
+	select MIGRATION
+	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	help
+	  Allows the compaction of memory for the allocation of huge pages.
+
+#
 # support for page migration
 #
 config MIGRATION
@@ -180,9 +189,11 @@ config MIGRATION
 	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
 	help
 	  Allows the migration of the physical location of pages of processes
-	  while the virtual addresses are not changed. This is useful for
-	  example on NUMA systems to put pages nearer to the processors accessing
-	  the page.
+	  while the virtual addresses are not changed. This is useful in
+	  two situations. The first is on NUMA systems to put pages nearer
+	  to the processors accessing them. The second is when allocating huge
+	  pages as migration can relocate pages to satisfy a huge page
+	  allocation instead of reclaiming.
 
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
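With the fragment above, compaction stays opt-in and automatically pulls in page migration through the select. A configuration enabling it would carry lines like these (illustrative .config excerpt; COMPACTION is only visible when the listed dependencies are met):

	CONFIG_EXPERIMENTAL=y
	CONFIG_HUGETLB_PAGE=y
	CONFIG_MIGRATION=y
	CONFIG_COMPACTION=y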
diff --git a/mm/Makefile b/mm/Makefile
index 6c2a73a54a43..8982504bd03b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_COMPACTION) += compaction.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000000..94cce51b0b35
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,605 @@
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include "internal.h"
+
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+	struct list_head freepages;	/* List of free pages to migrate to */
+	struct list_head migratepages;	/* List of pages being migrated */
+	unsigned long nr_freepages;	/* Number of isolated free pages */
+	unsigned long nr_migratepages;	/* Number of pages to migrate */
+	unsigned long free_pfn;		/* isolate_freepages search base */
+	unsigned long migrate_pfn;	/* isolate_migratepages search base */
+
+	/* Account for isolated anon and file pages */
+	unsigned long nr_anon;
+	unsigned long nr_file;
+
+	unsigned int order;		/* order a direct compactor needs */
+	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
+	struct zone *zone;
+};
+
+static unsigned long release_freepages(struct list_head *freelist)
+{
+	struct page *page, *next;
+	unsigned long count = 0;
+
+	list_for_each_entry_safe(page, next, freelist, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+		count++;
+	}
+
+	return count;
+}
+
+/* Isolate free pages onto a private freelist. Must hold zone->lock */
+static unsigned long isolate_freepages_block(struct zone *zone,
+				unsigned long blockpfn,
+				struct list_head *freelist)
+{
+	unsigned long zone_end_pfn, end_pfn;
+	int total_isolated = 0;
+	struct page *cursor;
+
+	/* Get the last PFN we should scan for free pages at */
+	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
+
+	/* Find the first usable PFN in the block to initialise page cursor */
+	for (; blockpfn < end_pfn; blockpfn++) {
+		if (pfn_valid_within(blockpfn))
+			break;
+	}
+	cursor = pfn_to_page(blockpfn);
+
+	/* Isolate free pages. This assumes the block is valid */
+	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+		int isolated, i;
+		struct page *page = cursor;
+
+		if (!pfn_valid_within(blockpfn))
+			continue;
+
+		if (!PageBuddy(page))
+			continue;
+
+		/* Found a free page, break it into order-0 pages */
+		isolated = split_free_page(page);
+		total_isolated += isolated;
+		for (i = 0; i < isolated; i++) {
+			list_add(&page->lru, freelist);
+			page++;
+		}
+
+		/* If a page was split, advance to the end of it */
+		if (isolated) {
+			blockpfn += isolated - 1;
+			cursor += isolated - 1;
+		}
+	}
+
+	return total_isolated;
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+
+	int migratetype = get_pageblock_migratetype(page);
+
+	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+		return false;
+
+	/* If the page is a large free page, then allow migration */
+	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+		return true;
+
+	/* If the block is MIGRATE_MOVABLE, allow migration */
+	if (migratetype == MIGRATE_MOVABLE)
+		return true;
+
+	/* Otherwise skip the block */
+	return false;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ */
+static void isolate_freepages(struct zone *zone,
+				struct compact_control *cc)
+{
+	struct page *page;
+	unsigned long high_pfn, low_pfn, pfn;
+	unsigned long flags;
+	int nr_freepages = cc->nr_freepages;
+	struct list_head *freelist = &cc->freepages;
+
+	pfn = cc->free_pfn;
+	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+	high_pfn = low_pfn;
+
+	/*
+	 * Isolate free pages until enough are available to migrate the
+	 * pages on cc->migratepages. We stop searching if the migrate
+	 * and free page scanners meet or enough free pages are isolated.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+					pfn -= pageblock_nr_pages) {
+		unsigned long isolated;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		/*
+		 * Check for overlapping nodes/zones. It's possible on some
+		 * configurations to have a setup like
+		 * node0 node1 node0
+		 * i.e. it's possible that all pages within a zones range of
+		 * pages do not belong to a single zone.
+		 */
+		page = pfn_to_page(pfn);
+		if (page_zone(page) != zone)
+			continue;
+
+		/* Check the block is suitable for migration */
+		if (!suitable_migration_target(page))
+			continue;
+
+		/* Found a block suitable for isolating free pages from */
+		isolated = isolate_freepages_block(zone, pfn, freelist);
+		nr_freepages += isolated;
+
+		/*
+		 * Record the highest PFN we isolated pages from. When next
+		 * looking for free pages, the search will restart here as
+		 * page migration may have returned some pages to the allocator
+		 */
+		if (isolated)
+			high_pfn = max(high_pfn, pfn);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	/* split_free_page does not map the pages */
+	list_for_each_entry(page, freelist, lru) {
+		arch_alloc_page(page, 0);
+		kernel_map_pages(page, 1, 1);
+	}
+
+	cc->free_pfn = high_pfn;
+	cc->nr_freepages = nr_freepages;
+}
+
+/* Update the number of anon and file isolated pages in the zone */
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
+{
+	struct page *page;
+	unsigned int count[NR_LRU_LISTS] = { 0, };
+
+	list_for_each_entry(page, &cc->migratepages, lru) {
+		int lru = page_lru_base_type(page);
+		count[lru]++;
+	}
+
+	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct zone *zone)
+{
+
+	unsigned long inactive, isolated;
+
+	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+					zone_page_state(zone, NR_INACTIVE_ANON);
+	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+					zone_page_state(zone, NR_ISOLATED_ANON);
+
+	return isolated > inactive;
+}
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ */
+static unsigned long isolate_migratepages(struct zone *zone,
+					struct compact_control *cc)
+{
+	unsigned long low_pfn, end_pfn;
+	struct list_head *migratelist = &cc->migratepages;
+
+	/* Do not scan outside zone boundaries */
+	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+
+	/* Only scan within a pageblock boundary */
+	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+
+	/* Do not cross the free scanner or scan within a memory hole */
+	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
+		cc->migrate_pfn = end_pfn;
+		return 0;
+	}
+
+	/*
+	 * Ensure that there are not too many pages isolated from the LRU
+	 * list by either parallel reclaimers or compaction. If there are,
+	 * delay for some time until fewer pages are isolated
+	 */
+	while (unlikely(too_many_isolated(zone))) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+		if (fatal_signal_pending(current))
+			return 0;
+	}
+
+	/* Time to isolate some pages for migration */
+	spin_lock_irq(&zone->lru_lock);
+	for (; low_pfn < end_pfn; low_pfn++) {
+		struct page *page;
+		if (!pfn_valid_within(low_pfn))
+			continue;
+
+		/* Get the page and skip if free */
+		page = pfn_to_page(low_pfn);
+		if (PageBuddy(page))
+			continue;
+
+		/* Try to isolate the page */
+		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+			continue;
+
+		/* Successfully isolated */
+		del_page_from_lru_list(zone, page, page_lru(page));
+		list_add(&page->lru, migratelist);
+		mem_cgroup_del_lru(page);
+		cc->nr_migratepages++;
+
+		/* Avoid isolating too much */
+		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+			break;
+	}
+
+	acct_isolated(zone, cc);
+
+	spin_unlock_irq(&zone->lru_lock);
+	cc->migrate_pfn = low_pfn;
+
+	return cc->nr_migratepages;
+}
+
+/*
+ * This is a migrate-callback that "allocates" freepages by taking pages
+ * from the isolated freelists in the block we are migrating to.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+					unsigned long data,
+					int **result)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+	struct page *freepage;
+
+	/* Isolate free pages if necessary */
+	if (list_empty(&cc->freepages)) {
+		isolate_freepages(cc->zone, cc);
+
+		if (list_empty(&cc->freepages))
+			return NULL;
+	}
+
+	freepage = list_entry(cc->freepages.next, struct page, lru);
+	list_del(&freepage->lru);
+	cc->nr_freepages--;
+
+	return freepage;
+}
+
+/*
+ * We cannot control nr_migratepages and nr_freepages fully when migration is
+ * running as migrate_pages() has no knowledge of compact_control. When
+ * migration is complete, we count the number of pages on the lists by hand.
+ */
+static void update_nr_listpages(struct compact_control *cc)
+{
+	int nr_migratepages = 0;
+	int nr_freepages = 0;
+	struct page *page;
+
+	list_for_each_entry(page, &cc->migratepages, lru)
+		nr_migratepages++;
+	list_for_each_entry(page, &cc->freepages, lru)
+		nr_freepages++;
+
+	cc->nr_migratepages = nr_migratepages;
+	cc->nr_freepages = nr_freepages;
+}
+
+static int compact_finished(struct zone *zone,
+						struct compact_control *cc)
+{
+	unsigned int order;
+	unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+
+	if (fatal_signal_pending(current))
+		return COMPACT_PARTIAL;
+
+	/* Compaction run completes if the migrate and free scanner meet */
+	if (cc->free_pfn <= cc->migrate_pfn)
+		return COMPACT_COMPLETE;
+
+	/* Compaction run is not finished if the watermark is not met */
+	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+		return COMPACT_CONTINUE;
+
+	if (cc->order == -1)
+		return COMPACT_CONTINUE;
+
+	/* Direct compactor: Is a suitable page free? */
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (order >= pageblock_order && zone->free_area[order].nr_free)
+			return COMPACT_PARTIAL;
+	}
+
+	return COMPACT_CONTINUE;
+}
+
+static int compact_zone(struct zone *zone, struct compact_control *cc)
+{
+	int ret;
+
+	/* Setup to move all movable pages to the end of the zone */
+	cc->migrate_pfn = zone->zone_start_pfn;
+	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
+	cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+	migrate_prep_local();
+
+	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+		unsigned long nr_migrate, nr_remaining;
+
+		if (!isolate_migratepages(zone, cc))
+			continue;
+
+		nr_migrate = cc->nr_migratepages;
+		migrate_pages(&cc->migratepages, compaction_alloc,
+						(unsigned long)cc, 0);
+		update_nr_listpages(cc);
+		nr_remaining = cc->nr_migratepages;
+
+		count_vm_event(COMPACTBLOCKS);
+		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
+		if (nr_remaining)
+			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+
+		/* Release LRU pages not migrated */
+		if (!list_empty(&cc->migratepages)) {
+			putback_lru_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
+		}
+
+	}
+
+	/* Release free pages and check accounting */
+	cc->nr_freepages -= release_freepages(&cc->freepages);
+	VM_BUG_ON(cc->nr_freepages != 0);
+
+	return ret;
+}
+
+static unsigned long compact_zone_order(struct zone *zone,
+						int order, gfp_t gfp_mask)
+{
+	struct compact_control cc = {
+		.nr_freepages = 0,
+		.nr_migratepages = 0,
+		.order = order,
+		.migratetype = allocflags_to_migratetype(gfp_mask),
+		.zone = zone,
+	};
+	INIT_LIST_HEAD(&cc.freepages);
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	return compact_zone(zone, &cc);
+}
+
+int sysctl_extfrag_threshold = 500;
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @zonelist: The zonelist used for the current allocation
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+ *
+ * This is the main entry point for direct page compaction.
+ */
+unsigned long try_to_compact_pages(struct zonelist *zonelist,
+			int order, gfp_t gfp_mask, nodemask_t *nodemask)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	int may_enter_fs = gfp_mask & __GFP_FS;
+	int may_perform_io = gfp_mask & __GFP_IO;
+	unsigned long watermark;
+	struct zoneref *z;
+	struct zone *zone;
+	int rc = COMPACT_SKIPPED;
+
+	/*
+	 * Check whether it is worth even starting compaction. The order check
+	 * is made because we assume the page allocator can satisfy the
+	 * "cheaper" orders without taking special steps
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+		return rc;
+
+	count_vm_event(COMPACTSTALL);
+
+	/* Compact each zone in the list */
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+								nodemask) {
+		int fragindex;
+		int status;
+
+		/*
+		 * Watermarks for order-0 must be met for compaction. Note
+		 * the 2UL. This is because during migration, copies of
+		 * pages need to be allocated and for a short time, the
+		 * footprint is higher
+		 */
+		watermark = low_wmark_pages(zone) + (2UL << order);
+		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+			continue;
+
+		/*
+		 * fragmentation index determines if allocation failures are
+		 * due to low memory or external fragmentation
+		 *
+		 * index of -1 implies allocations might succeed depending
+		 *	on watermarks
+		 * index towards 0 implies failure is due to lack of memory
+		 * index towards 1000 implies failure is due to fragmentation
+		 *
+		 * Only compact if a failure would be due to fragmentation.
+		 */
+		fragindex = fragmentation_index(zone, order);
+		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+			continue;
+
+		if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
+			rc = COMPACT_PARTIAL;
+			break;
+		}
+
+		status = compact_zone_order(zone, order, gfp_mask);
+		rc = max(status, rc);
+
+		if (zone_watermark_ok(zone, order, watermark, 0, 0))
+			break;
+	}
+
+	return rc;
+}
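The fragmentation index is what keeps try_to_compact_pages() honest: -1 means the allocation might simply succeed, values near 0 blame low memory, values near 1000 blame external fragmentation, and only the latter is fixable by compaction. A hedged restatement of the per-zone gate above (the function name is illustrative):

	/* With the default sysctl_extfrag_threshold of 500, a zone is only
	 * compacted when more than half of the predicted allocation failure
	 * is attributable to external fragmentation rather than low memory. */
	static int worth_compacting(int fragindex, int threshold)
	{
		if (fragindex < 0)	/* -1: allocation may succeed as-is */
			return 0;
		return fragindex > threshold;
	}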
+
+
+/* Compact all zones within a node */
+static int compact_node(int nid)
+{
+	int zoneid;
+	pg_data_t *pgdat;
+	struct zone *zone;
+
+	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
+		return -EINVAL;
+	pgdat = NODE_DATA(nid);
+
+	/* Flush pending updates to the LRU lists */
+	lru_add_drain_all();
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		struct compact_control cc = {
+			.nr_freepages = 0,
+			.nr_migratepages = 0,
+			.order = -1,
+		};
+
+		zone = &pgdat->node_zones[zoneid];
+		if (!populated_zone(zone))
+			continue;
+
+		cc.zone = zone;
+		INIT_LIST_HEAD(&cc.freepages);
+		INIT_LIST_HEAD(&cc.migratepages);
+
+		compact_zone(zone, &cc);
+
+		VM_BUG_ON(!list_empty(&cc.freepages));
+		VM_BUG_ON(!list_empty(&cc.migratepages));
+	}
+
+	return 0;
+}
+
+/* Compact all nodes in the system */
+static int compact_nodes(void)
+{
+	int nid;
+
+	for_each_online_node(nid)
+		compact_node(nid);
+
+	return COMPACT_COMPLETE;
+}
+
+/* The written value is actually unused, all memory is compacted */
+int sysctl_compact_memory;
+
+/* This is the entry point for compacting all nodes via /proc/sys/vm */
+int sysctl_compaction_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	if (write)
+		return compact_nodes();
+
+	return 0;
+}
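Any write to /proc/sys/vm/compact_memory triggers a sweep of every online node; the written value itself is discarded. From userspace the trigger is just (a sketch):

	#include <fcntl.h>
	#include <unistd.h>

	/* Equivalent of: echo 1 > /proc/sys/vm/compact_memory */
	static int compact_all_nodes(void)
	{
		int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}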
+
+int sysctl_extfrag_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+	return 0;
+}
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+ssize_t sysfs_compact_node(struct sys_device *dev,
+			struct sysdev_attribute *attr,
+			const char *buf, size_t count)
+{
+	compact_node(dev->id);
+
+	return count;
+}
+static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+
+int compaction_register_node(struct node *node)
+{
+	return sysdev_create_file(&node->sysdev, &attr_compact);
+}
+
+void compaction_unregister_node(struct node *node)
+{
+	return sysdev_remove_file(&node->sysdev, &attr_compact);
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640f..88d719665a28 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,7 +441,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 	/*
 	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
 	 * before shmem_readpage has a chance to mark them as SwapBacked: they
-	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * need to go on the anon lru below, and mem_cgroup_cache_charge
 	 * (called in add_to_page_cache) needs to know where they're going too.
 	 */
 	if (mapping_cap_swap_backed(mapping))
@@ -452,7 +452,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		if (page_is_file_cache(page))
 			lru_cache_add_file(page);
 		else
-			lru_cache_add_active_anon(page);
+			lru_cache_add_anon(page);
 	}
 	return ret;
 }
@@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 #ifdef CONFIG_NUMA
 struct page *__page_cache_alloc(gfp_t gfp)
 {
+	int n;
+	struct page *page;
+
 	if (cpuset_do_page_mem_spread()) {
-		int n = cpuset_mem_spread_node();
-		return alloc_pages_exact_node(n, gfp, 0);
+		get_mems_allowed();
+		n = cpuset_mem_spread_node();
+		page = alloc_pages_exact_node(n, gfp, 0);
+		put_mems_allowed();
+		return page;
 	}
 	return alloc_pages(gfp, 0);
 }
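The get_mems_allowed()/put_mems_allowed() pair introduced here brackets every reader of the cpuset's mems_allowed, so a concurrent rebind can never leave the allocator looking at a half-updated nodemask. The pattern, abstracted (a sketch; pick_node_and_alloc() is a placeholder for whichever allocator call sits inside the section):

	struct page *alloc_under_mems_allowed(gfp_t gfp)
	{
		struct page *page;

		get_mems_allowed();		 /* open read-side section  */
		page = pick_node_and_alloc(gfp); /* reads mems_allowed      */
		put_mems_allowed();		 /* close read-side section */
		return page;
	}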
diff --git a/mm/highmem.c b/mm/highmem.c
index bed8a8bfd01f..66baa20f78f5 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,7 +422,7 @@ void __init page_address_init(void)
 
 #endif	/* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
 
-#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+#ifdef CONFIG_DEBUG_HIGHMEM
 
 void debug_kmap_atomic(enum km_type type)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9e6bbf3772..54d42b009dbe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
-	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask, &mpol, &nodemask);
+	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
 
+	get_mems_allowed();
+	zonelist = huge_zonelist(vma, address,
+					htlb_alloc_mask, &mpol, &nodemask);
 	/*
 	 * A child process with MAP_PRIVATE mappings created by their parent
 	 * have no page reserves. This check ensures that reservations are
@@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	 */
 	if (!vma_has_reserves(vma) &&
 			h->free_huge_pages - h->resv_huge_pages == 0)
-		return NULL;
+		goto err;
 
 	/* If reserves cannot be used, ensure enough pages are in the pool */
 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
-		return NULL;
+		goto err;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
@@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 			break;
 		}
 	}
+err:
 	mpol_cond_put(mpol);
+	put_mems_allowed();
 	return page;
 }
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 956880f2ff49..6c3e99b4ae7c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -318,14 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
 			  struct anon_vma *anon_vma)
 {
 	rmap_item->anon_vma = anon_vma;
-	atomic_inc(&anon_vma->ksm_refcount);
+	atomic_inc(&anon_vma->external_refcount);
 }
 
 static void drop_anon_vma(struct rmap_item *rmap_item)
 {
 	struct anon_vma *anon_vma = rmap_item->anon_vma;
 
-	if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
+	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
 		int empty = list_empty(&anon_vma->head);
 		spin_unlock(&anon_vma->lock);
 		if (empty)
diff --git a/mm/memory.c b/mm/memory.c
index 833952d8b74d..119b7ccdf39b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1227,8 +1227,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
-/*
- * Do a quick page-table lookup for a single page.
+/**
+ * follow_page - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
  */
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
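The new kerneldoc pins down follow_page()'s three-way return convention: NULL when nothing is mapped, an ERR_PTR for mappings with no page descriptor, and a page pointer otherwise. A sketch of a caller honouring all three cases (the wrapper itself is illustrative; the caller must hold mmap_sem):

	/* Illustrative only */
	static struct page *get_user_page_at(struct vm_area_struct *vma,
					     unsigned long addr)
	{
		struct page *page = follow_page(vma, addr, FOLL_GET);

		if (!page)		/* no mapping at addr */
			return NULL;
		if (IS_ERR(page))	/* e.g. a raw PFN mapping */
			return NULL;
		return page;		/* reference held; put_page() later */
	}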
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index be211a582930..a4cfcdc00455 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -415,12 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	 * This means the page allocator ignores this zone.
 	 * So, zonelist must be updated after online.
 	 */
+	mutex_lock(&zonelists_mutex);
 	if (!populated_zone(zone))
 		need_zonelists_rebuild = 1;
 
 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
 		online_pages_range);
 	if (ret) {
+		mutex_unlock(&zonelists_mutex);
 		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
 			nr_pages, pfn);
 		memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +431,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
+	if (need_zonelists_rebuild)
+		build_all_zonelists(zone);
+	else
+		zone_pcp_update(zone);
 
-	zone_pcp_update(zone);
+	mutex_unlock(&zonelists_mutex);
 	setup_per_zone_wmarks();
 	calculate_zone_inactive_ratio(zone);
 	if (onlined_pages) {
@@ -438,10 +444,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
 	}
 
-	if (need_zonelists_rebuild)
-		build_all_zonelists();
-	else
-		vm_total_pages = nr_free_pagecache_pages();
+	vm_total_pages = nr_free_pagecache_pages();
 
 	writeback_set_ratelimit();
 
@@ -482,6 +485,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 
+/*
+ * called by cpu_up() to online a node without onlined memory.
+ */
+int mem_online_node(int nid)
+{
+	pg_data_t *pgdat;
+	int ret;
+
+	lock_system_sleep();
+	pgdat = hotadd_new_pgdat(nid, 0);
+	if (!pgdat) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	node_set_online(nid);
+	ret = register_one_node(nid);
+	BUG_ON(ret);
+
+out:
+	unlock_system_sleep();
+	return ret;
+}
+
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 int __ref add_memory(int nid, u64 start, u64 size)
 {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe0..75751012c552 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -119,7 +119,22 @@ struct mempolicy default_policy = {
 
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
-	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+	/*
+	 * If read-side task has no lock to protect task->mempolicy, write-side
+	 * task will rebind the task->mempolicy in two steps. The first step is
+	 * setting all the newly-allowed nodes, and the second step is removing
+	 * all the disallowed nodes. In this way, we can avoid finding no node
+	 * to alloc a page from.
+	 * If we have a lock to protect task->mempolicy in read-side, we do
+	 * rebind directly.
+	 *
+	 * step:
+	 *	MPOL_REBIND_ONCE  - do rebind work at once
+	 *	MPOL_REBIND_STEP1 - set all the newly-allowed nodes
+	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+	 */
+	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
+			enum mpol_rebind_step step);
 } mpol_ops[MPOL_MAX];
 
 /* Check that the nodemask contains at least one populated zone */
@@ -127,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
 {
 	int nd, k;
 
-	/* Check that there is something useful in this mask */
-	k = policy_zone;
-
 	for_each_node_mask(nd, *nodemask) {
 		struct zone *z;
 
@@ -145,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+	return pol->flags & MPOL_MODE_FLAGS;
 }
 
 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
@@ -277,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
 	kmem_cache_free(policy_cache, p);
 }
 
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
+				enum mpol_rebind_step step)
 {
 }
 
-static void mpol_rebind_nodemask(struct mempolicy *pol,
-				 const nodemask_t *nodes)
+/*
+ * step:
+ *	MPOL_REBIND_ONCE  - do rebind work at once
+ *	MPOL_REBIND_STEP1 - set all the newly-allowed nodes
+ *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
+				 enum mpol_rebind_step step)
 {
 	nodemask_t tmp;
 
@@ -291,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 	else {
-		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
-			    *nodes);
-		pol->w.cpuset_mems_allowed = *nodes;
+		/*
+		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
+		 * result
+		 */
+		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
+			nodes_remap(tmp, pol->v.nodes,
+					pol->w.cpuset_mems_allowed, *nodes);
+			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
+		} else if (step == MPOL_REBIND_STEP2) {
+			tmp = pol->w.cpuset_mems_allowed;
+			pol->w.cpuset_mems_allowed = *nodes;
+		} else
+			BUG();
 	}
 
-	pol->v.nodes = tmp;
+	if (nodes_empty(tmp))
+		tmp = *nodes;
+
+	if (step == MPOL_REBIND_STEP1)
+		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
+	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
+		pol->v.nodes = tmp;
+	else
+		BUG();
+
 	if (!node_isset(current->il_next, tmp)) {
 		current->il_next = next_node(current->il_next, tmp);
 		if (current->il_next >= MAX_NUMNODES)
@@ -307,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
-				  const nodemask_t *nodes)
+				  const nodemask_t *nodes,
+				  enum mpol_rebind_step step)
 {
 	nodemask_t tmp;
 
@@ -330,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
 	}
 }
 
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
-			       const nodemask_t *newmask)
+/*
+ * mpol_rebind_policy - Migrate a policy to a different set of nodes
+ *
+ * If read-side task has no lock to protect task->mempolicy, write-side
+ * task will rebind the task->mempolicy in two steps. The first step is
+ * setting all the newly-allowed nodes, and the second step is removing
+ * all the disallowed nodes. In this way, we can avoid finding no node
+ * to alloc a page from.
+ * If we have a lock to protect task->mempolicy in read-side, we do
+ * rebind directly.
+ *
+ * step:
+ *	MPOL_REBIND_ONCE  - do rebind work at once
+ *	MPOL_REBIND_STEP1 - set all the newly-allowed nodes
+ *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
+				enum mpol_rebind_step step)
 {
 	if (!pol)
 		return;
-	if (!mpol_store_user_nodemask(pol) &&
+	if (!mpol_store_user_nodemask(pol) && step == 0 &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
-	mpol_ops[pol->mode].rebind(pol, newmask);
+
+	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
+		return;
+
+	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
+		BUG();
+
+	if (step == MPOL_REBIND_STEP1)
+		pol->flags |= MPOL_F_REBINDING;
+	else if (step == MPOL_REBIND_STEP2)
+		pol->flags &= ~MPOL_F_REBINDING;
+	else if (step >= MPOL_REBIND_NSTEP)
+		BUG();
+
+	mpol_ops[pol->mode].rebind(pol, newmask, step);
 }
 
 /*
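A concrete run of the two-step rebind, with plain bitmasks standing in for nodemask_t (sketch only): moving a policy from nodes {0,1} to {2,3} first ORs the new nodes in, then drops the old ones, so a racing lockless reader always observes a non-empty mask:

	static unsigned long rebind_two_step(unsigned long nodes,
					     unsigned long newmask)
	{
		nodes |= newmask;	/* STEP1: {0,1} -> {0,1,2,3} */
		nodes = newmask;	/* STEP2: {0,1,2,3} -> {2,3} */
		return nodes;
	}

At no point between the two steps is the mask empty, which is exactly the failure the one-shot MPOL_REBIND_ONCE path could expose to readers holding no lock.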
@@ -349,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
349 * Called with task's alloc_lock held. 417 * Called with task's alloc_lock held.
350 */ 418 */
351 419
352void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 420void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 enum mpol_rebind_step step)
353{ 422{
354 mpol_rebind_policy(tsk->mempolicy, new); 423 mpol_rebind_policy(tsk->mempolicy, new, step);
355} 424}
356 425
357/* 426/*
@@ -366,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
366 435
367 down_write(&mm->mmap_sem); 436 down_write(&mm->mmap_sem);
368 for (vma = mm->mmap; vma; vma = vma->vm_next) 437 for (vma = mm->mmap; vma; vma = vma->vm_next)
369 mpol_rebind_policy(vma->vm_policy, new); 438 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
370 up_write(&mm->mmap_sem); 439 up_write(&mm->mmap_sem);
371} 440}
372 441
@@ -859,7 +928,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
859 nodes_clear(nmask); 928 nodes_clear(nmask);
860 node_set(source, nmask); 929 node_set(source, nmask);
861 930
862 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, 931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
863 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 932 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
864 933
865 if (!list_empty(&pagelist)) 934 if (!list_empty(&pagelist))
@@ -1444,15 +1513,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1444 /* 1513 /*
1445 * Normally, MPOL_BIND allocations are node-local within the 1514 * Normally, MPOL_BIND allocations are node-local within the
1446 * allowed nodemask. However, if __GFP_THISNODE is set and the 1515 * allowed nodemask. However, if __GFP_THISNODE is set and the
1447 * current node is part of the mask, we use the zonelist for 1516 * current node isn't part of the mask, we use the zonelist for
1448 * the first node in the mask instead. 1517 * the first node in the mask instead.
1449 */ 1518 */
1450 if (unlikely(gfp & __GFP_THISNODE) && 1519 if (unlikely(gfp & __GFP_THISNODE) &&
1451 unlikely(!node_isset(nd, policy->v.nodes))) 1520 unlikely(!node_isset(nd, policy->v.nodes)))
1452 nd = first_node(policy->v.nodes); 1521 nd = first_node(policy->v.nodes);
1453 break; 1522 break;
1454 case MPOL_INTERLEAVE: /* should not happen */
1455 break;
1456 default: 1523 default:
1457 BUG(); 1524 BUG();
1458 } 1525 }
@@ -1572,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1572 * to the struct mempolicy for conditional unref after allocation. 1639 * to the struct mempolicy for conditional unref after allocation.
1573 * If the effective policy is 'BIND, returns a pointer to the mempolicy's 1640 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1574 * @nodemask for filtering the zonelist. 1641 * @nodemask for filtering the zonelist.
1642 *
1643 * Must be protected by get_mems_allowed()
1575 */ 1644 */
1576struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1645struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1577 gfp_t gfp_flags, struct mempolicy **mpol, 1646 gfp_t gfp_flags, struct mempolicy **mpol,
@@ -1617,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1617 if (!(mask && current->mempolicy)) 1686 if (!(mask && current->mempolicy))
1618 return false; 1687 return false;
1619 1688
1689 task_lock(current);
1620 mempolicy = current->mempolicy; 1690 mempolicy = current->mempolicy;
1621 switch (mempolicy->mode) { 1691 switch (mempolicy->mode) {
1622 case MPOL_PREFERRED: 1692 case MPOL_PREFERRED:
@@ -1636,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1636 default: 1706 default:
1637 BUG(); 1707 BUG();
1638 } 1708 }
1709 task_unlock(current);
1639 1710
1640 return true; 1711 return true;
1641} 1712}
@@ -1683,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1683{ 1754{
1684 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1755 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1685 struct zonelist *zl; 1756 struct zonelist *zl;
1757 struct page *page;
1686 1758
1759 get_mems_allowed();
1687 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1760 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1688 unsigned nid; 1761 unsigned nid;
1689 1762
1690 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1763 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1691 mpol_cond_put(pol); 1764 mpol_cond_put(pol);
1692 return alloc_page_interleave(gfp, 0, nid); 1765 page = alloc_page_interleave(gfp, 0, nid);
1766 put_mems_allowed();
1767 return page;
1693 } 1768 }
1694 zl = policy_zonelist(gfp, pol); 1769 zl = policy_zonelist(gfp, pol);
1695 if (unlikely(mpol_needs_cond_ref(pol))) { 1770 if (unlikely(mpol_needs_cond_ref(pol))) {
@@ -1699,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1699 struct page *page = __alloc_pages_nodemask(gfp, 0, 1774 struct page *page = __alloc_pages_nodemask(gfp, 0,
1700 zl, policy_nodemask(gfp, pol)); 1775 zl, policy_nodemask(gfp, pol));
1701 __mpol_put(pol); 1776 __mpol_put(pol);
1777 put_mems_allowed();
1702 return page; 1778 return page;
1703 } 1779 }
1704 /* 1780 /*
1705 * fast path: default or task policy 1781 * fast path: default or task policy
1706 */ 1782 */
1707 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1783 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1784 put_mems_allowed();
1785 return page;
1708} 1786}
1709 1787
1710/** 1788/**
@@ -1729,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1729struct page *alloc_pages_current(gfp_t gfp, unsigned order) 1807struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1730{ 1808{
1731 struct mempolicy *pol = current->mempolicy; 1809 struct mempolicy *pol = current->mempolicy;
1810 struct page *page;
1732 1811
1733 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1812 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1734 pol = &default_policy; 1813 pol = &default_policy;
1735 1814
1815 get_mems_allowed();
1736 /* 1816 /*
1737 * No reference counting needed for current->mempolicy 1817 * No reference counting needed for current->mempolicy
1738 * nor system default_policy 1818 * nor system default_policy
1739 */ 1819 */
1740 if (pol->mode == MPOL_INTERLEAVE) 1820 if (pol->mode == MPOL_INTERLEAVE)
1741 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1821 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1742 return __alloc_pages_nodemask(gfp, order, 1822 else
1823 page = __alloc_pages_nodemask(gfp, order,
1743 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1824 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1825 put_mems_allowed();
1826 return page;
1744} 1827}
1745EXPORT_SYMBOL(alloc_pages_current); 1828EXPORT_SYMBOL(alloc_pages_current);
1746 1829
@@ -1750,6 +1833,9 @@ EXPORT_SYMBOL(alloc_pages_current);
1750 * with the mems_allowed returned by cpuset_mems_allowed(). This 1833 * with the mems_allowed returned by cpuset_mems_allowed(). This
1751 * keeps mempolicies cpuset relative after its cpuset moves. See 1834 * keeps mempolicies cpuset relative after its cpuset moves. See
1752 * further kernel/cpuset.c update_nodemask(). 1835 * further kernel/cpuset.c update_nodemask().
1836 *
 1837 * current's mempolicy may be rebound by another task (the task that changes
 1838 * cpuset's mems), so we need not do the rebind work for the current task.
1753 */ 1839 */
1754 1840
1755/* Slow path of a mempolicy duplicate */ 1841/* Slow path of a mempolicy duplicate */
@@ -1759,13 +1845,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1759 1845
1760 if (!new) 1846 if (!new)
1761 return ERR_PTR(-ENOMEM); 1847 return ERR_PTR(-ENOMEM);
1848
1849 /* task's mempolicy is protected by alloc_lock */
1850 if (old == current->mempolicy) {
1851 task_lock(current);
1852 *new = *old;
1853 task_unlock(current);
1854 } else
1855 *new = *old;
1856
1762 rcu_read_lock(); 1857 rcu_read_lock();
1763 if (current_cpuset_is_being_rebound()) { 1858 if (current_cpuset_is_being_rebound()) {
1764 nodemask_t mems = cpuset_mems_allowed(current); 1859 nodemask_t mems = cpuset_mems_allowed(current);
1765 mpol_rebind_policy(old, &mems); 1860 if (new->flags & MPOL_F_REBINDING)
1861 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1862 else
1863 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1766 } 1864 }
1767 rcu_read_unlock(); 1865 rcu_read_unlock();
1768 *new = *old;
1769 atomic_set(&new->refcnt, 1); 1866 atomic_set(&new->refcnt, 1);
1770 return new; 1867 return new;
1771} 1868}
@@ -1792,16 +1889,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1792 return tompol; 1889 return tompol;
1793} 1890}
1794 1891
1795static int mpol_match_intent(const struct mempolicy *a,
1796 const struct mempolicy *b)
1797{
1798 if (a->flags != b->flags)
1799 return 0;
1800 if (!mpol_store_user_nodemask(a))
1801 return 1;
1802 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1803}
1804
1805/* Slow path of a mempolicy comparison */ 1892/* Slow path of a mempolicy comparison */
1806int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1893int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1807{ 1894{
@@ -1809,8 +1896,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1809 return 0; 1896 return 0;
1810 if (a->mode != b->mode) 1897 if (a->mode != b->mode)
1811 return 0; 1898 return 0;
1812 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) 1899 if (a->flags != b->flags)
1813 return 0; 1900 return 0;
1901 if (mpol_store_user_nodemask(a))
1902 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1903 return 0;
1904
1814 switch (a->mode) { 1905 switch (a->mode) {
1815 case MPOL_BIND: 1906 case MPOL_BIND:
1816 /* Fall through */ 1907 /* Fall through */
@@ -2006,26 +2097,22 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2006 return; 2097 return;
2007 /* contextualize the tmpfs mount point mempolicy */ 2098 /* contextualize the tmpfs mount point mempolicy */
2008 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 2099 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2009 if (IS_ERR(new)) { 2100 if (IS_ERR(new))
2010 mpol_put(mpol); /* drop our ref on sb mpol */ 2101 goto put_free; /* no valid nodemask intersection */
2011 NODEMASK_SCRATCH_FREE(scratch);
2012 return; /* no valid nodemask intersection */
2013 }
2014 2102
2015 task_lock(current); 2103 task_lock(current);
2016 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 2104 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2017 task_unlock(current); 2105 task_unlock(current);
2018 mpol_put(mpol); /* drop our ref on sb mpol */ 2106 mpol_put(mpol); /* drop our ref on sb mpol */
2019 if (ret) { 2107 if (ret)
2020 NODEMASK_SCRATCH_FREE(scratch); 2108 goto put_free;
2021 mpol_put(new);
2022 return;
2023 }
2024 2109
2025 /* Create pseudo-vma that contains just the policy */ 2110 /* Create pseudo-vma that contains just the policy */
2026 memset(&pvma, 0, sizeof(struct vm_area_struct)); 2111 memset(&pvma, 0, sizeof(struct vm_area_struct));
2027 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 2112 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2028 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 2113 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2114
2115put_free:
2029 mpol_put(new); /* drop initial ref */ 2116 mpol_put(new); /* drop initial ref */
2030 NODEMASK_SCRATCH_FREE(scratch); 2117 NODEMASK_SCRATCH_FREE(scratch);
2031 } 2118 }
@@ -2132,9 +2219,15 @@ void numa_default_policy(void)
2132 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2219 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2133 * Used only for mpol_parse_str() and mpol_to_str() 2220 * Used only for mpol_parse_str() and mpol_to_str()
2134 */ 2221 */
2135#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) 2222#define MPOL_LOCAL MPOL_MAX
2136static const char * const policy_types[] = 2223static const char * const policy_modes[] =
2137 { "default", "prefer", "bind", "interleave", "local" }; 2224{
2225 [MPOL_DEFAULT] = "default",
2226 [MPOL_PREFERRED] = "prefer",
2227 [MPOL_BIND] = "bind",
2228 [MPOL_INTERLEAVE] = "interleave",
2229 [MPOL_LOCAL] = "local"
2230};
2138 2231
2139 2232
2140#ifdef CONFIG_TMPFS 2233#ifdef CONFIG_TMPFS
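Because the new table is indexed directly by the MPOL_* mode values, mpol_parse_str() can drop its separate index variable and scan the array with the mode itself, as the next hunk shows. A small self-contained illustration of the same lookup (userspace C; the enum ordering is assumed to match the kernel's):

#include <stdio.h>
#include <string.h>

enum { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL };

static const char * const policy_modes[] = {
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
};

static int parse_mode(const char *str)
{
	unsigned short mode;

	for (mode = 0; mode <= MPOL_LOCAL; mode++)
		if (!strcmp(str, policy_modes[mode]))
			return mode;
	return -1;			/* unknown mode string */
}

int main(void)
{
	printf("%d\n", parse_mode("interleave"));	/* prints 3 */
	return 0;
}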
@@ -2159,12 +2252,11 @@ static const char * const policy_types[] =
2159int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2252int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2160{ 2253{
2161 struct mempolicy *new = NULL; 2254 struct mempolicy *new = NULL;
2162 unsigned short uninitialized_var(mode); 2255 unsigned short mode;
2163 unsigned short uninitialized_var(mode_flags); 2256 unsigned short uninitialized_var(mode_flags);
2164 nodemask_t nodes; 2257 nodemask_t nodes;
2165 char *nodelist = strchr(str, ':'); 2258 char *nodelist = strchr(str, ':');
2166 char *flags = strchr(str, '='); 2259 char *flags = strchr(str, '=');
2167 int i;
2168 int err = 1; 2260 int err = 1;
2169 2261
2170 if (nodelist) { 2262 if (nodelist) {
@@ -2180,13 +2272,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2180 if (flags) 2272 if (flags)
2181 *flags++ = '\0'; /* terminate mode string */ 2273 *flags++ = '\0'; /* terminate mode string */
2182 2274
2183 for (i = 0; i <= MPOL_LOCAL; i++) { 2275 for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2184 if (!strcmp(str, policy_types[i])) { 2276 if (!strcmp(str, policy_modes[mode])) {
2185 mode = i;
2186 break; 2277 break;
2187 } 2278 }
2188 } 2279 }
2189 if (i > MPOL_LOCAL) 2280 if (mode > MPOL_LOCAL)
2190 goto out; 2281 goto out;
2191 2282
2192 switch (mode) { 2283 switch (mode) {
@@ -2250,7 +2341,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2250 if (IS_ERR(new)) 2341 if (IS_ERR(new))
2251 goto out; 2342 goto out;
2252 2343
2253 { 2344 if (no_context) {
2345 /* save for contextualization */
2346 new->w.user_nodemask = nodes;
2347 } else {
2254 int ret; 2348 int ret;
2255 NODEMASK_SCRATCH(scratch); 2349 NODEMASK_SCRATCH(scratch);
2256 if (scratch) { 2350 if (scratch) {
@@ -2266,10 +2360,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2266 } 2360 }
2267 } 2361 }
2268 err = 0; 2362 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
2273 2363
2274out: 2364out:
2275 /* Restore string for error message */ 2365 /* Restore string for error message */
@@ -2338,11 +2428,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2338 BUG(); 2428 BUG();
2339 } 2429 }
2340 2430
2341 l = strlen(policy_types[mode]); 2431 l = strlen(policy_modes[mode]);
2342 if (buffer + maxlen < p + l + 1) 2432 if (buffer + maxlen < p + l + 1)
2343 return -ENOSPC; 2433 return -ENOSPC;
2344 2434
2345 strcpy(p, policy_types[mode]); 2435 strcpy(p, policy_modes[mode]);
2346 p += l; 2436 p += l;
2347 2437
2348 if (flags & MPOL_MODE_FLAGS) { 2438 if (flags & MPOL_MODE_FLAGS) {
diff --git a/mm/migrate.c b/mm/migrate.c
index d3f3f7f81075..09e2471afa0f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,7 +40,8 @@
40 40
41/* 41/*
42 * migrate_prep() needs to be called before we start compiling a list of pages 42 * migrate_prep() needs to be called before we start compiling a list of pages
43 * to be migrated using isolate_lru_page(). 43 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
44 * undesirable, use migrate_prep_local()
44 */ 45 */
45int migrate_prep(void) 46int migrate_prep(void)
46{ 47{
@@ -55,26 +56,29 @@ int migrate_prep(void)
55 return 0; 56 return 0;
56} 57}
57 58
59/* Do the necessary work of migrate_prep but not if it involves other CPUs */
60int migrate_prep_local(void)
61{
62 lru_add_drain();
63
64 return 0;
65}
66
58/* 67/*
59 * Add isolated pages on the list back to the LRU under page lock 68 * Add isolated pages on the list back to the LRU under page lock
60 * to avoid leaking evictable pages back onto unevictable list. 69 * to avoid leaking evictable pages back onto unevictable list.
61 *
62 * returns the number of pages put back.
63 */ 70 */
64int putback_lru_pages(struct list_head *l) 71void putback_lru_pages(struct list_head *l)
65{ 72{
66 struct page *page; 73 struct page *page;
67 struct page *page2; 74 struct page *page2;
68 int count = 0;
69 75
70 list_for_each_entry_safe(page, page2, l, lru) { 76 list_for_each_entry_safe(page, page2, l, lru) {
71 list_del(&page->lru); 77 list_del(&page->lru);
72 dec_zone_page_state(page, NR_ISOLATED_ANON + 78 dec_zone_page_state(page, NR_ISOLATED_ANON +
73 page_is_file_cache(page)); 79 page_is_file_cache(page));
74 putback_lru_page(page); 80 putback_lru_page(page);
75 count++;
76 } 81 }
77 return count;
78} 82}
79 83
80/* 84/*
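A hypothetical caller of the revised helpers, patterned on how the migration call sites in this tree are structured (everything except the helpers shown above is illustrative): drain local pagevecs, isolate candidates onto a private list, then put whatever migrate_pages() leaves behind back on the LRU. Note that putback_lru_pages() no longer returns a count, so its result can simply be ignored:

static void migrate_example(struct list_head *pagelist,
			    new_page_t get_new_page, unsigned long private)
{
	migrate_prep_local();	/* drain this CPU only, no IPIs */

	/* ... isolate_lru_page() each candidate onto pagelist ... */

	if (migrate_pages(pagelist, get_new_page, private, 0))
		putback_lru_pages(pagelist);	/* return leftovers to LRU */
}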
@@ -490,7 +494,8 @@ static int fallback_migrate_page(struct address_space *mapping,
490 * < 0 - error code 494 * < 0 - error code
491 * == 0 - success 495 * == 0 - success
492 */ 496 */
493static int move_to_new_page(struct page *newpage, struct page *page) 497static int move_to_new_page(struct page *newpage, struct page *page,
498 int remap_swapcache)
494{ 499{
495 struct address_space *mapping; 500 struct address_space *mapping;
496 int rc; 501 int rc;
@@ -525,10 +530,12 @@ static int move_to_new_page(struct page *newpage, struct page *page)
525 else 530 else
526 rc = fallback_migrate_page(mapping, newpage, page); 531 rc = fallback_migrate_page(mapping, newpage, page);
527 532
528 if (!rc) 533 if (rc) {
529 remove_migration_ptes(page, newpage);
530 else
531 newpage->mapping = NULL; 534 newpage->mapping = NULL;
535 } else {
536 if (remap_swapcache)
537 remove_migration_ptes(page, newpage);
538 }
532 539
533 unlock_page(newpage); 540 unlock_page(newpage);
534 541
@@ -545,9 +552,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
545 int rc = 0; 552 int rc = 0;
546 int *result = NULL; 553 int *result = NULL;
547 struct page *newpage = get_new_page(page, private, &result); 554 struct page *newpage = get_new_page(page, private, &result);
555 int remap_swapcache = 1;
548 int rcu_locked = 0; 556 int rcu_locked = 0;
549 int charge = 0; 557 int charge = 0;
550 struct mem_cgroup *mem = NULL; 558 struct mem_cgroup *mem = NULL;
559 struct anon_vma *anon_vma = NULL;
551 560
552 if (!newpage) 561 if (!newpage)
553 return -ENOMEM; 562 return -ENOMEM;
@@ -604,6 +613,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
604 if (PageAnon(page)) { 613 if (PageAnon(page)) {
605 rcu_read_lock(); 614 rcu_read_lock();
606 rcu_locked = 1; 615 rcu_locked = 1;
616
617 /* Determine how to safely use anon_vma */
618 if (!page_mapped(page)) {
619 if (!PageSwapCache(page))
620 goto rcu_unlock;
621
622 /*
623 * We cannot be sure that the anon_vma of an unmapped
624 * swapcache page is safe to use because we don't
625 * know in advance if the VMA that this page belonged
626 * to still exists. If the VMA and others sharing the
627 * data have been freed, then the anon_vma could
628 * already be invalid.
629 *
630 * To avoid this possibility, swapcache pages get
631 * migrated but are not remapped when migration
632 * completes
633 */
634 remap_swapcache = 0;
635 } else {
636 /*
637 * Take a reference count on the anon_vma if the
638 * page is mapped so that it is guaranteed to
639 * exist when the page is remapped later
640 */
641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount);
643 }
607 } 644 }
608 645
609 /* 646 /*
@@ -638,11 +675,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
638 675
639skip_unmap: 676skip_unmap:
640 if (!page_mapped(page)) 677 if (!page_mapped(page))
641 rc = move_to_new_page(newpage, page); 678 rc = move_to_new_page(newpage, page, remap_swapcache);
642 679
643 if (rc) 680 if (rc && remap_swapcache)
644 remove_migration_ptes(page, page); 681 remove_migration_ptes(page, page);
645rcu_unlock: 682rcu_unlock:
683
684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
686 int empty = list_empty(&anon_vma->head);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691
646 if (rcu_locked) 692 if (rcu_locked)
647 rcu_read_unlock(); 693 rcu_read_unlock();
648uncharge: 694uncharge:
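The reference drop at the end of unmap_and_move() deserves a second look: atomic_dec_and_lock() takes anon_vma->lock only when the external refcount actually falls to zero, so the common path stays lock-free and the lock is held just long enough to test whether the anon_vma is empty. Condensed from the hunk above (the helper name is illustrative):

static void drop_anon_vma_ref(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_lock(&anon_vma->external_refcount,
				&anon_vma->lock)) {
		int empty = list_empty(&anon_vma->head);

		spin_unlock(&anon_vma->lock);
		if (empty)
			anon_vma_free(anon_vma);	/* last user gone */
	}
}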
diff --git a/mm/mincore.c b/mm/mincore.c
index f77433c20279..9ac42dc6d7b6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,6 +19,40 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
23 unsigned long addr, unsigned long end,
24 unsigned char *vec)
25{
26#ifdef CONFIG_HUGETLB_PAGE
27 struct hstate *h;
28
29 h = hstate_vma(vma);
30 while (1) {
31 unsigned char present;
32 pte_t *ptep;
33 /*
 34 * Huge pages are always in RAM for now, but
 35 * in principle this should be checked.
36 */
37 ptep = huge_pte_offset(current->mm,
38 addr & huge_page_mask(h));
39 present = ptep && !huge_pte_none(huge_ptep_get(ptep));
40 while (1) {
41 *vec = present;
42 vec++;
43 addr += PAGE_SIZE;
44 if (addr == end)
45 return;
46 /* check hugepage border */
47 if (!(addr & ~huge_page_mask(h)))
48 break;
49 }
50 }
51#else
52 BUG();
53#endif
54}
55
22/* 56/*
23 * Later we can get more picky about what "in core" means precisely. 57 * Later we can get more picky about what "in core" means precisely.
24 * For now, simply check to see if the page is in the page cache, 58 * For now, simply check to see if the page is in the page cache,
@@ -49,145 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
49 return present; 83 return present;
50} 84}
51 85
52/* 86static void mincore_unmapped_range(struct vm_area_struct *vma,
53 * Do a chunk of "sys_mincore()". We've already checked 87 unsigned long addr, unsigned long end,
54 * all the arguments, we hold the mmap semaphore: we should 88 unsigned char *vec)
55 * just return the amount of info we're asked for.
56 */
57static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
58{ 89{
59 pgd_t *pgd; 90 unsigned long nr = (end - addr) >> PAGE_SHIFT;
60 pud_t *pud;
61 pmd_t *pmd;
62 pte_t *ptep;
63 spinlock_t *ptl;
64 unsigned long nr;
65 int i; 91 int i;
66 pgoff_t pgoff;
67 struct vm_area_struct *vma = find_vma(current->mm, addr);
68 92
69 /* 93 if (vma->vm_file) {
70 * find_vma() didn't find anything above us, or we're 94 pgoff_t pgoff;
71 * in an unmapped hole in the address space: ENOMEM.
72 */
73 if (!vma || addr < vma->vm_start)
74 return -ENOMEM;
75
76#ifdef CONFIG_HUGETLB_PAGE
77 if (is_vm_hugetlb_page(vma)) {
78 struct hstate *h;
79 unsigned long nr_huge;
80 unsigned char present;
81 95
82 i = 0; 96 pgoff = linear_page_index(vma, addr);
83 nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); 97 for (i = 0; i < nr; i++, pgoff++)
84 h = hstate_vma(vma); 98 vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
85 nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) 99 } else {
86 - (addr >> huge_page_shift(h)) + 1; 100 for (i = 0; i < nr; i++)
87 nr_huge = min(nr_huge, 101 vec[i] = 0;
88 (vma->vm_end - addr) >> huge_page_shift(h));
89 while (1) {
90 /* hugepage always in RAM for now,
91 * but generally it needs to be check */
92 ptep = huge_pte_offset(current->mm,
93 addr & huge_page_mask(h));
94 present = !!(ptep &&
95 !huge_pte_none(huge_ptep_get(ptep)));
96 while (1) {
97 vec[i++] = present;
98 addr += PAGE_SIZE;
99 /* reach buffer limit */
100 if (i == nr)
101 return nr;
102 /* check hugepage border */
103 if (!((addr & ~huge_page_mask(h))
104 >> PAGE_SHIFT))
105 break;
106 }
107 }
108 return nr;
109 } 102 }
110#endif 103}
111
112 /*
113 * Calculate how many pages there are left in the last level of the
114 * PTE array for our address.
115 */
116 nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
117
118 /*
119 * Don't overrun this vma
120 */
121 nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
122
123 /*
124 * Don't return more than the caller asked for
125 */
126 nr = min(nr, pages);
127 104
128 pgd = pgd_offset(vma->vm_mm, addr); 105static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
129 if (pgd_none_or_clear_bad(pgd)) 106 unsigned long addr, unsigned long end,
130 goto none_mapped; 107 unsigned char *vec)
131 pud = pud_offset(pgd, addr); 108{
132 if (pud_none_or_clear_bad(pud)) 109 unsigned long next;
133 goto none_mapped; 110 spinlock_t *ptl;
134 pmd = pmd_offset(pud, addr); 111 pte_t *ptep;
135 if (pmd_none_or_clear_bad(pmd))
136 goto none_mapped;
137 112
138 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 113 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
139 for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { 114 do {
140 unsigned char present;
141 pte_t pte = *ptep; 115 pte_t pte = *ptep;
116 pgoff_t pgoff;
142 117
143 if (pte_present(pte)) { 118 next = addr + PAGE_SIZE;
144 present = 1; 119 if (pte_none(pte))
145 120 mincore_unmapped_range(vma, addr, next, vec);
146 } else if (pte_none(pte)) { 121 else if (pte_present(pte))
147 if (vma->vm_file) { 122 *vec = 1;
148 pgoff = linear_page_index(vma, addr); 123 else if (pte_file(pte)) {
149 present = mincore_page(vma->vm_file->f_mapping,
150 pgoff);
151 } else
152 present = 0;
153
154 } else if (pte_file(pte)) {
155 pgoff = pte_to_pgoff(pte); 124 pgoff = pte_to_pgoff(pte);
156 present = mincore_page(vma->vm_file->f_mapping, pgoff); 125 *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
157
158 } else { /* pte is a swap entry */ 126 } else { /* pte is a swap entry */
159 swp_entry_t entry = pte_to_swp_entry(pte); 127 swp_entry_t entry = pte_to_swp_entry(pte);
128
160 if (is_migration_entry(entry)) { 129 if (is_migration_entry(entry)) {
161 /* migration entries are always uptodate */ 130 /* migration entries are always uptodate */
162 present = 1; 131 *vec = 1;
163 } else { 132 } else {
164#ifdef CONFIG_SWAP 133#ifdef CONFIG_SWAP
165 pgoff = entry.val; 134 pgoff = entry.val;
166 present = mincore_page(&swapper_space, pgoff); 135 *vec = mincore_page(&swapper_space, pgoff);
167#else 136#else
168 WARN_ON(1); 137 WARN_ON(1);
169 present = 1; 138 *vec = 1;
170#endif 139#endif
171 } 140 }
172 } 141 }
142 vec++;
143 } while (ptep++, addr = next, addr != end);
144 pte_unmap_unlock(ptep - 1, ptl);
145}
173 146
174 vec[i] = present; 147static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
175 } 148 unsigned long addr, unsigned long end,
176 pte_unmap_unlock(ptep-1, ptl); 149 unsigned char *vec)
150{
151 unsigned long next;
152 pmd_t *pmd;
177 153
178 return nr; 154 pmd = pmd_offset(pud, addr);
155 do {
156 next = pmd_addr_end(addr, end);
157 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec);
159 else
160 mincore_pte_range(vma, pmd, addr, next, vec);
161 vec += (next - addr) >> PAGE_SHIFT;
162 } while (pmd++, addr = next, addr != end);
163}
179 164
180none_mapped: 165static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
181 if (vma->vm_file) { 166 unsigned long addr, unsigned long end,
182 pgoff = linear_page_index(vma, addr); 167 unsigned char *vec)
183 for (i = 0; i < nr; i++, pgoff++) 168{
184 vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); 169 unsigned long next;
185 } else { 170 pud_t *pud;
186 for (i = 0; i < nr; i++) 171
187 vec[i] = 0; 172 pud = pud_offset(pgd, addr);
173 do {
174 next = pud_addr_end(addr, end);
175 if (pud_none_or_clear_bad(pud))
176 mincore_unmapped_range(vma, addr, next, vec);
177 else
178 mincore_pmd_range(vma, pud, addr, next, vec);
179 vec += (next - addr) >> PAGE_SHIFT;
180 } while (pud++, addr = next, addr != end);
181}
182
183static void mincore_page_range(struct vm_area_struct *vma,
184 unsigned long addr, unsigned long end,
185 unsigned char *vec)
186{
187 unsigned long next;
188 pgd_t *pgd;
189
190 pgd = pgd_offset(vma->vm_mm, addr);
191 do {
192 next = pgd_addr_end(addr, end);
193 if (pgd_none_or_clear_bad(pgd))
194 mincore_unmapped_range(vma, addr, next, vec);
195 else
196 mincore_pud_range(vma, pgd, addr, next, vec);
197 vec += (next - addr) >> PAGE_SHIFT;
198 } while (pgd++, addr = next, addr != end);
199}
200
201/*
202 * Do a chunk of "sys_mincore()". We've already checked
203 * all the arguments, we hold the mmap semaphore: we should
204 * just return the amount of info we're asked for.
205 */
206static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
207{
208 struct vm_area_struct *vma;
209 unsigned long end;
210
211 vma = find_vma(current->mm, addr);
212 if (!vma || addr < vma->vm_start)
213 return -ENOMEM;
214
215 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
216
217 if (is_vm_hugetlb_page(vma)) {
218 mincore_hugetlb_page_range(vma, addr, end, vec);
219 return (end - addr) >> PAGE_SHIFT;
188 } 220 }
189 221
190 return nr; 222 end = pmd_addr_end(addr, end);
223
224 if (is_vm_hugetlb_page(vma))
225 mincore_hugetlb_page_range(vma, addr, end, vec);
226 else
227 mincore_page_range(vma, addr, end, vec);
228
229 return (end - addr) >> PAGE_SHIFT;
191} 230}
192 231
193/* 232/*
@@ -247,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
247 * the temporary buffer size. 286 * the temporary buffer size.
248 */ 287 */
249 down_read(&current->mm->mmap_sem); 288 down_read(&current->mm->mmap_sem);
250 retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); 289 retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
251 up_read(&current->mm->mmap_sem); 290 up_read(&current->mm->mmap_sem);
252 291
253 if (retval <= 0) 292 if (retval <= 0)
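The user-visible contract of the syscall is unchanged by this restructuring; a minimal userspace probe of mincore(2) semantics (assumes a glibc system) shows the per-page residency vector the rewritten walker fills in:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4 * sysconf(_SC_PAGESIZE);
	unsigned char vec[4];
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int i;

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;			/* fault in the first page only */
	if (mincore(p, len, vec))
		return 1;
	for (i = 0; i < 4; i++)		/* expect: resident, then not */
		printf("page %d: %sresident\n", i, (vec[i] & 1) ? "" : "not ");
	munmap(p, len);
	return 0;
}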
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6326c71b663..08b349931ebc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h> 51#include <linux/memory.h>
52#include <linux/compaction.h>
52#include <trace/events/kmem.h> 53#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h> 54#include <linux/ftrace_event.h>
54 55
@@ -475,6 +476,8 @@ static inline void __free_one_page(struct page *page,
475 int migratetype) 476 int migratetype)
476{ 477{
477 unsigned long page_idx; 478 unsigned long page_idx;
479 unsigned long combined_idx;
480 struct page *buddy;
478 481
479 if (unlikely(PageCompound(page))) 482 if (unlikely(PageCompound(page)))
480 if (unlikely(destroy_compound_page(page, order))) 483 if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +491,6 @@ static inline void __free_one_page(struct page *page,
488 VM_BUG_ON(bad_range(zone, page)); 491 VM_BUG_ON(bad_range(zone, page));
489 492
490 while (order < MAX_ORDER-1) { 493 while (order < MAX_ORDER-1) {
491 unsigned long combined_idx;
492 struct page *buddy;
493
494 buddy = __page_find_buddy(page, page_idx, order); 494 buddy = __page_find_buddy(page, page_idx, order);
495 if (!page_is_buddy(page, buddy, order)) 495 if (!page_is_buddy(page, buddy, order))
496 break; 496 break;
@@ -505,8 +505,29 @@ static inline void __free_one_page(struct page *page,
505 order++; 505 order++;
506 } 506 }
507 set_page_order(page, order); 507 set_page_order(page, order);
508 list_add(&page->lru, 508
509 &zone->free_area[order].free_list[migratetype]); 509 /*
510 * If this is not the largest possible page, check if the buddy
511 * of the next-highest order is free. If it is, it's possible
 512 * that pages are being freed that will coalesce soon. In case
 513 * that is happening, add the free page to the tail of the list
 514 * so it's less likely to be used soon and more likely to be merged
 515 * as a higher-order page.
516 */
517 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
518 struct page *higher_page, *higher_buddy;
519 combined_idx = __find_combined_index(page_idx, order);
520 higher_page = page + combined_idx - page_idx;
521 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
522 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
523 list_add_tail(&page->lru,
524 &zone->free_area[order].free_list[migratetype]);
525 goto out;
526 }
527 }
528
529 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
530out:
510 zone->free_area[order].nr_free++; 531 zone->free_area[order].nr_free++;
511} 532}
512 533
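The index arithmetic behind the new tail-placement heuristic is plain bit manipulation: a block's buddy differs from it only in bit `order`, and the merged block's index is the same value with that bit cleared. A standalone illustration mirroring __page_find_buddy() and __find_combined_index():

#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);	/* flip bit 'order' */
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);	/* clear bit 'order' */
}

int main(void)
{
	/* A free order-2 block at index 4: its buddy is at 0, the merged
	 * order-3 block starts at 0, and that block's own buddy is at 8. */
	printf("buddy:    %lu\n", buddy_index(4, 2));		/* 0 */
	printf("combined: %lu\n", combined_index(4, 2));	/* 0 */
	printf("higher:   %lu\n", buddy_index(0, 3));		/* 8 */
	return 0;
}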
@@ -599,20 +620,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
599 spin_unlock(&zone->lock); 620 spin_unlock(&zone->lock);
600} 621}
601 622
602static void __free_pages_ok(struct page *page, unsigned int order) 623static bool free_pages_prepare(struct page *page, unsigned int order)
603{ 624{
604 unsigned long flags;
605 int i; 625 int i;
606 int bad = 0; 626 int bad = 0;
607 int wasMlocked = __TestClearPageMlocked(page);
608 627
609 trace_mm_page_free_direct(page, order); 628 trace_mm_page_free_direct(page, order);
610 kmemcheck_free_shadow(page, order); 629 kmemcheck_free_shadow(page, order);
611 630
612 for (i = 0 ; i < (1 << order) ; ++i) 631 for (i = 0; i < (1 << order); i++) {
613 bad += free_pages_check(page + i); 632 struct page *pg = page + i;
633
634 if (PageAnon(pg))
635 pg->mapping = NULL;
636 bad += free_pages_check(pg);
637 }
614 if (bad) 638 if (bad)
615 return; 639 return false;
616 640
617 if (!PageHighMem(page)) { 641 if (!PageHighMem(page)) {
618 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 642 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +646,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
622 arch_free_page(page, order); 646 arch_free_page(page, order);
623 kernel_map_pages(page, 1 << order, 0); 647 kernel_map_pages(page, 1 << order, 0);
624 648
649 return true;
650}
651
652static void __free_pages_ok(struct page *page, unsigned int order)
653{
654 unsigned long flags;
655 int wasMlocked = __TestClearPageMlocked(page);
656
657 if (!free_pages_prepare(page, order))
658 return;
659
625 local_irq_save(flags); 660 local_irq_save(flags);
626 if (unlikely(wasMlocked)) 661 if (unlikely(wasMlocked))
627 free_page_mlock(page); 662 free_page_mlock(page);
@@ -1107,21 +1142,9 @@ void free_hot_cold_page(struct page *page, int cold)
1107 int migratetype; 1142 int migratetype;
1108 int wasMlocked = __TestClearPageMlocked(page); 1143 int wasMlocked = __TestClearPageMlocked(page);
1109 1144
1110 trace_mm_page_free_direct(page, 0); 1145 if (!free_pages_prepare(page, 0))
1111 kmemcheck_free_shadow(page, 0);
1112
1113 if (PageAnon(page))
1114 page->mapping = NULL;
1115 if (free_pages_check(page))
1116 return; 1146 return;
1117 1147
1118 if (!PageHighMem(page)) {
1119 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1120 debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
1121 }
1122 arch_free_page(page, 0);
1123 kernel_map_pages(page, 1, 0);
1124
1125 migratetype = get_pageblock_migratetype(page); 1148 migratetype = get_pageblock_migratetype(page);
1126 set_page_private(page, migratetype); 1149 set_page_private(page, migratetype);
1127 local_irq_save(flags); 1150 local_irq_save(flags);
@@ -1188,6 +1211,51 @@ void split_page(struct page *page, unsigned int order)
1188} 1211}
1189 1212
1190/* 1213/*
1214 * Similar to split_page except the page is already free. As this is only
1215 * being used for migration, the migratetype of the block also changes.
1216 * As this is called with interrupts disabled, the caller is responsible
 1217 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1218 * are enabled.
1219 *
1220 * Note: this is probably too low level an operation for use in drivers.
1221 * Please consult with lkml before using this in your driver.
1222 */
1223int split_free_page(struct page *page)
1224{
1225 unsigned int order;
1226 unsigned long watermark;
1227 struct zone *zone;
1228
1229 BUG_ON(!PageBuddy(page));
1230
1231 zone = page_zone(page);
1232 order = page_order(page);
1233
1234 /* Obey watermarks as if the page was being allocated */
1235 watermark = low_wmark_pages(zone) + (1 << order);
1236 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1237 return 0;
1238
1239 /* Remove page from free list */
1240 list_del(&page->lru);
1241 zone->free_area[order].nr_free--;
1242 rmv_page_order(page);
1243 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1244
1245 /* Split into individual pages */
1246 set_page_refcounted(page);
1247 split_page(page, order);
1248
1249 if (order >= pageblock_order - 1) {
1250 struct page *endpage = page + (1 << order) - 1;
1251 for (; page < endpage; page += pageblock_nr_pages)
1252 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1253 }
1254
1255 return 1 << order;
1256}
1257
1258/*
1191 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1259 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1192 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1260 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1193 * or two. 1261 * or two.
@@ -1693,6 +1761,62 @@ out:
1693 return page; 1761 return page;
1694} 1762}
1695 1763
1764#ifdef CONFIG_COMPACTION
1765/* Try memory compaction for high-order allocations before reclaim */
1766static struct page *
1767__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1768 struct zonelist *zonelist, enum zone_type high_zoneidx,
1769 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1770 int migratetype, unsigned long *did_some_progress)
1771{
1772 struct page *page;
1773
1774 if (!order || compaction_deferred(preferred_zone))
1775 return NULL;
1776
1777 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1778 nodemask);
1779 if (*did_some_progress != COMPACT_SKIPPED) {
1780
1781 /* Page migration frees to the PCP lists but we want merging */
1782 drain_pages(get_cpu());
1783 put_cpu();
1784
1785 page = get_page_from_freelist(gfp_mask, nodemask,
1786 order, zonelist, high_zoneidx,
1787 alloc_flags, preferred_zone,
1788 migratetype);
1789 if (page) {
1790 preferred_zone->compact_considered = 0;
1791 preferred_zone->compact_defer_shift = 0;
1792 count_vm_event(COMPACTSUCCESS);
1793 return page;
1794 }
1795
1796 /*
1797 * It's bad if compaction run occurs and fails.
1798 * The most likely reason is that pages exist,
1799 * but not enough to satisfy watermarks.
1800 */
1801 count_vm_event(COMPACTFAIL);
1802 defer_compaction(preferred_zone);
1803
1804 cond_resched();
1805 }
1806
1807 return NULL;
1808}
1809#else
1810static inline struct page *
1811__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1812 struct zonelist *zonelist, enum zone_type high_zoneidx,
1813 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1814 int migratetype, unsigned long *did_some_progress)
1815{
1816 return NULL;
1817}
1818#endif /* CONFIG_COMPACTION */
1819
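compact_considered and compact_defer_shift implement an exponential backoff: each failed compaction run resets the counter and widens the window of allocation attempts for which compaction is skipped. A sketch modelled on the compaction_deferred()/defer_compaction() helpers this series adds elsewhere (the cap value here is an assumption):

#define COMPACT_MAX_DEFER_SHIFT 6

static void defer_compaction(struct zone *zone)
{
	zone->compact_considered = 0;
	if (++zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

static bool compaction_deferred(struct zone *zone)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	/* clamp so the counter cannot overflow while deferred */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	return zone->compact_considered < defer_limit;
}

A successful run, as seen in the hunk above, zeroes both fields so the next failure starts the backoff from scratch.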
1696/* The really slow allocator path where we enter direct reclaim */ 1820/* The really slow allocator path where we enter direct reclaim */
1697static inline struct page * 1821static inline struct page *
1698__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 1822__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2003,15 @@ rebalance:
1879 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2003 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1880 goto nopage; 2004 goto nopage;
1881 2005
2006 /* Try direct compaction */
2007 page = __alloc_pages_direct_compact(gfp_mask, order,
2008 zonelist, high_zoneidx,
2009 nodemask,
2010 alloc_flags, preferred_zone,
2011 migratetype, &did_some_progress);
2012 if (page)
2013 goto got_pg;
2014
1882 /* Try direct reclaim and then allocating */ 2015 /* Try direct reclaim and then allocating */
1883 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2016 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1884 zonelist, high_zoneidx, 2017 zonelist, high_zoneidx,
@@ -1970,10 +2103,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1970 if (unlikely(!zonelist->_zonerefs->zone)) 2103 if (unlikely(!zonelist->_zonerefs->zone))
1971 return NULL; 2104 return NULL;
1972 2105
2106 get_mems_allowed();
1973 /* The preferred zone is used for statistics later */ 2107 /* The preferred zone is used for statistics later */
1974 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2108 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1975 if (!preferred_zone) 2109 if (!preferred_zone) {
2110 put_mems_allowed();
1976 return NULL; 2111 return NULL;
2112 }
1977 2113
1978 /* First allocation attempt */ 2114 /* First allocation attempt */
1979 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2115 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2119,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1983 page = __alloc_pages_slowpath(gfp_mask, order, 2119 page = __alloc_pages_slowpath(gfp_mask, order,
1984 zonelist, high_zoneidx, nodemask, 2120 zonelist, high_zoneidx, nodemask,
1985 preferred_zone, migratetype); 2121 preferred_zone, migratetype);
2122 put_mems_allowed();
1986 2123
1987 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2124 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1988 return page; 2125 return page;
@@ -2434,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2434 strncpy((char*)table->data, saved_string, 2571 strncpy((char*)table->data, saved_string,
2435 NUMA_ZONELIST_ORDER_LEN); 2572 NUMA_ZONELIST_ORDER_LEN);
2436 user_zonelist_order = oldval; 2573 user_zonelist_order = oldval;
2437 } else if (oldval != user_zonelist_order) 2574 } else if (oldval != user_zonelist_order) {
2438 build_all_zonelists(); 2575 mutex_lock(&zonelists_mutex);
2576 build_all_zonelists(NULL);
2577 mutex_unlock(&zonelists_mutex);
2578 }
2439 } 2579 }
2440out: 2580out:
2441 mutex_unlock(&zl_order_mutex); 2581 mutex_unlock(&zl_order_mutex);
@@ -2582,7 +2722,7 @@ static int default_zonelist_order(void)
2582 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 2722 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
2583 * If they are really small and used heavily, the system can fall 2723 * If they are really small and used heavily, the system can fall
2584 * into OOM very easily. 2724 * into OOM very easily.
 2585 * This function detect ZONE_DMA/DMA32 size and confgigures zone order. 2725 * This function detects ZONE_DMA/DMA32 size and configures zone order.
2586 */ 2726 */
2587 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 2727 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
2588 low_kmem_size = 0; 2728 low_kmem_size = 0;
@@ -2594,6 +2734,15 @@ static int default_zonelist_order(void)
2594 if (zone_type < ZONE_NORMAL) 2734 if (zone_type < ZONE_NORMAL)
2595 low_kmem_size += z->present_pages; 2735 low_kmem_size += z->present_pages;
2596 total_size += z->present_pages; 2736 total_size += z->present_pages;
2737 } else if (zone_type == ZONE_NORMAL) {
2738 /*
2739 * If any node has only lowmem, then node order
2740 * is preferred to allow kernel allocations
2741 * locally; otherwise, they can easily infringe
2742 * on other nodes when there is an abundance of
2743 * lowmem available to allocate from.
2744 */
2745 return ZONELIST_ORDER_NODE;
2597 } 2746 }
2598 } 2747 }
2599 } 2748 }
@@ -2776,9 +2925,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2776 */ 2925 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 2926static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 2927static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2928static void setup_zone_pageset(struct zone *zone);
2929
2930/*
2931 * Global mutex to protect against size modification of zonelists
2932 * as well as to serialize pageset setup for the new populated zone.
2933 */
2934DEFINE_MUTEX(zonelists_mutex);
2779 2935
2780/* return values int ....just for stop_machine() */ 2936/* return values int ....just for stop_machine() */
2781static int __build_all_zonelists(void *dummy) 2937static __init_refok int __build_all_zonelists(void *data)
2782{ 2938{
2783 int nid; 2939 int nid;
2784 int cpu; 2940 int cpu;
@@ -2793,6 +2949,14 @@ static int __build_all_zonelists(void *dummy)
2793 build_zonelist_cache(pgdat); 2949 build_zonelist_cache(pgdat);
2794 } 2950 }
2795 2951
2952#ifdef CONFIG_MEMORY_HOTPLUG
2953 /* Setup real pagesets for the new zone */
2954 if (data) {
2955 struct zone *zone = data;
2956 setup_zone_pageset(zone);
2957 }
2958#endif
2959
2796 /* 2960 /*
2797 * Initialize the boot_pagesets that are going to be used 2961 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for 2962 * for bootstrapping processors. The real pagesets for
@@ -2812,7 +2976,11 @@ static int __build_all_zonelists(void *dummy)
2812 return 0; 2976 return 0;
2813} 2977}
2814 2978
2815void build_all_zonelists(void) 2979/*
2980 * Called with zonelists_mutex held always
2981 * unless system_state == SYSTEM_BOOTING.
2982 */
2983void build_all_zonelists(void *data)
2816{ 2984{
2817 set_zonelist_order(); 2985 set_zonelist_order();
2818 2986
@@ -2823,7 +2991,7 @@ void build_all_zonelists(void)
2823 } else { 2991 } else {
2824 /* we have to stop all cpus to guarantee there is no user 2992 /* we have to stop all cpus to guarantee there is no user
2825 of zonelist */ 2993 of zonelist */
2826 stop_machine(__build_all_zonelists, NULL, NULL); 2994 stop_machine(__build_all_zonelists, data, NULL);
2827 /* cpuset refresh routine should be here */ 2995 /* cpuset refresh routine should be here */
2828 } 2996 }
2829 vm_total_pages = nr_free_pagecache_pages(); 2997 vm_total_pages = nr_free_pagecache_pages();
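A hypothetical memory-hotplug caller of the new interface, following the locking rule stated in the comment above (real onlining code would also decide between a full rebuild and a pageset-only update, which is elided here):

static void example_online_zone(struct zone *zone)
{
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(zone);	/* also sets up the zone's pagesets */
	mutex_unlock(&zonelists_mutex);
}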
@@ -3146,31 +3314,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3146 pcp->batch = PAGE_SHIFT * 8; 3314 pcp->batch = PAGE_SHIFT * 8;
3147} 3315}
3148 3316
3317static __meminit void setup_zone_pageset(struct zone *zone)
3318{
3319 int cpu;
3320
3321 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3322
3323 for_each_possible_cpu(cpu) {
3324 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3325
3326 setup_pageset(pcp, zone_batchsize(zone));
3327
3328 if (percpu_pagelist_fraction)
3329 setup_pagelist_highmark(pcp,
3330 (zone->present_pages /
3331 percpu_pagelist_fraction));
3332 }
3333}
3334
3149/* 3335/*
3150 * Allocate per cpu pagesets and initialize them. 3336 * Allocate per cpu pagesets and initialize them.
3151 * Before this call only boot pagesets were available. 3337 * Before this call only boot pagesets were available.
3152 * Boot pagesets will no longer be used by this processorr
3153 * after setup_per_cpu_pageset().
3154 */ 3338 */
3155void __init setup_per_cpu_pageset(void) 3339void __init setup_per_cpu_pageset(void)
3156{ 3340{
3157 struct zone *zone; 3341 struct zone *zone;
3158 int cpu;
3159 3342
3160 for_each_populated_zone(zone) { 3343 for_each_populated_zone(zone)
3161 zone->pageset = alloc_percpu(struct per_cpu_pageset); 3344 setup_zone_pageset(zone);
3162
3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3165
3166 setup_pageset(pcp, zone_batchsize(zone));
3167
3168 if (percpu_pagelist_fraction)
3169 setup_pagelist_highmark(pcp,
3170 (zone->present_pages /
3171 percpu_pagelist_fraction));
3172 }
3173 }
3174} 3345}
3175 3346
3176static noinline __init_refok 3347static noinline __init_refok
diff --git a/mm/readahead.c b/mm/readahead.c
index dfa9a1a03a11..77506a291a2d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -523,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
523 * @req_size: hint: total size of the read which the caller is performing in 523 * @req_size: hint: total size of the read which the caller is performing in
524 * pagecache pages 524 * pagecache pages
525 * 525 *
526 * page_cache_async_ondemand() should be called when a page is used which 526 * page_cache_async_readahead() should be called when a page is used which
527 * has the PG_readahead flag; this is a marker to suggest that the application 527 * has the PG_readahead flag; this is a marker to suggest that the application
528 * has used up enough of the readahead window that we should start pulling in 528 * has used up enough of the readahead window that we should start pulling in
529 * more pages. 529 * more pages.
diff --git a/mm/rmap.c b/mm/rmap.c
index 0feeef860a8f..38a336e2eea1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -250,7 +250,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
250 list_del(&anon_vma_chain->same_anon_vma); 250 list_del(&anon_vma_chain->same_anon_vma);
251 251
252 /* We must garbage collect the anon_vma if it's empty */ 252 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); 253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 254 spin_unlock(&anon_vma->lock);
255 255
256 if (empty) 256 if (empty)
@@ -274,7 +274,7 @@ static void anon_vma_ctor(void *data)
274 struct anon_vma *anon_vma = data; 274 struct anon_vma *anon_vma = data;
275 275
276 spin_lock_init(&anon_vma->lock); 276 spin_lock_init(&anon_vma->lock);
277 ksm_refcount_init(anon_vma); 277 anonvma_external_refcount_init(anon_vma);
278 INIT_LIST_HEAD(&anon_vma->head); 278 INIT_LIST_HEAD(&anon_vma->head);
279} 279}
280 280
@@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1131 return ret; 1131 return ret;
1132} 1132}
1133 1133
1134static bool is_vma_temporary_stack(struct vm_area_struct *vma)
1135{
1136 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1137
1138 if (!maybe_stack)
1139 return false;
1140
1141 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1142 VM_STACK_INCOMPLETE_SETUP)
1143 return true;
1144
1145 return false;
1146}
1147
1134/** 1148/**
1135 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1149 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
1136 * rmap method 1150 * rmap method
@@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1159 1173
1160 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1174 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1161 struct vm_area_struct *vma = avc->vma; 1175 struct vm_area_struct *vma = avc->vma;
1162 unsigned long address = vma_address(page, vma); 1176 unsigned long address;
1177
1178 /*
1179 * During exec, a temporary VMA is setup and later moved.
1180 * The VMA is moved under the anon_vma lock but not the
1181 * page tables leading to a race where migration cannot
1182 * find the migration ptes. Rather than increasing the
1183 * locking requirements of exec(), migration skips
1184 * temporary VMAs until after exec() completes.
1185 */
1186 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
1187 is_vma_temporary_stack(vma))
1188 continue;
1189
1190 address = vma_address(page, vma);
1163 if (address == -EFAULT) 1191 if (address == -EFAULT)
1164 continue; 1192 continue;
1165 ret = try_to_unmap_one(page, vma, address, flags); 1193 ret = try_to_unmap_one(page, vma, address, flags);
@@ -1355,10 +1383,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1355 /* 1383 /*
1356 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1384 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
1357 * because that depends on page_mapped(); but not all its usages 1385 * because that depends on page_mapped(); but not all its usages
1358 * are holding mmap_sem, which also gave the necessary guarantee 1386 * are holding mmap_sem. Users without mmap_sem are required to
 1359 * (that this anon_vma's slab has not already been destroyed). 1387 * take a reference count to prevent the anon_vma from disappearing
1360 * This needs to be reviewed later: avoiding page_lock_anon_vma()
1361 * is risky, and currently limits the usefulness of rmap_walk().
1362 */ 1388 */
1363 anon_vma = page_anon_vma(page); 1389 anon_vma = page_anon_vma(page);
1364 if (!anon_vma) 1390 if (!anon_vma)
diff --git a/mm/shmem.c b/mm/shmem.c
index 0cd7f66f1c66..4ef9797bd430 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
433 433
434 spin_unlock(&info->lock); 434 spin_unlock(&info->lock);
435 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 435 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
436 if (page)
437 set_page_private(page, 0);
438 spin_lock(&info->lock); 436 spin_lock(&info->lock);
439 437
440 if (!page) { 438 if (!page) {
diff --git a/mm/slab.c b/mm/slab.c
index 50a73fca19c4..02786e1a32d2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3217 if (in_interrupt() || (flags & __GFP_THISNODE)) 3217 if (in_interrupt() || (flags & __GFP_THISNODE))
3218 return NULL; 3218 return NULL;
3219 nid_alloc = nid_here = numa_node_id(); 3219 nid_alloc = nid_here = numa_node_id();
3220 get_mems_allowed();
3220 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3221 nid_alloc = cpuset_mem_spread_node(); 3222 nid_alloc = cpuset_mem_spread_node();
3222 else if (current->mempolicy) 3223 else if (current->mempolicy)
3223 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3225 put_mems_allowed();
3224 if (nid_alloc != nid_here) 3226 if (nid_alloc != nid_here)
3225 return ____cache_alloc_node(cachep, flags, nid_alloc); 3227 return ____cache_alloc_node(cachep, flags, nid_alloc);
3226 return NULL; 3228 return NULL;
@@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3247 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3248 return NULL; 3250 return NULL;
3249 3251
3252 get_mems_allowed();
3250 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3253 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3251 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3254 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3252 3255
@@ -3302,6 +3305,7 @@ retry:
3302 } 3305 }
3303 } 3306 }
3304 } 3307 }
3308 put_mems_allowed();
3305 return obj; 3309 return obj;
3306} 3310}
3307 3311
diff --git a/mm/slub.c b/mm/slub.c
index e46e3129697d..26f0cb9cc584 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1360 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1360 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1361 return NULL; 1361 return NULL;
1362 1362
1363 get_mems_allowed();
1363 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1364 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1364 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1365 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1365 struct kmem_cache_node *n; 1366 struct kmem_cache_node *n;
@@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1369 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1370 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1370 n->nr_partial > s->min_partial) { 1371 n->nr_partial > s->min_partial) {
1371 page = get_partial_node(n); 1372 page = get_partial_node(n);
1372 if (page) 1373 if (page) {
1374 put_mems_allowed();
1373 return page; 1375 return page;
1376 }
1374 } 1377 }
1375 } 1378 }
1379 put_mems_allowed();
1376#endif 1380#endif
1377 return NULL; 1381 return NULL;
1378} 1382}
diff --git a/mm/sparse.c b/mm/sparse.c
index dc0cc4d43ff3..95ac219af379 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
382struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) 382struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
383{ 383{
384 struct page *map; 384 struct page *map;
385 unsigned long size;
385 386
386 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 387 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
387 if (map) 388 if (map)
388 return map; 389 return map;
389 390
390 map = alloc_bootmem_pages_node(NODE_DATA(nid), 391 size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
391 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 392 map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
393 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
392 return map; 394 return map;
393} 395}
394void __init sparse_mem_maps_populate_node(struct page **map_map, 396void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
412 } 414 }
413 415
414 size = PAGE_ALIGN(size); 416 size = PAGE_ALIGN(size);
415 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); 417 map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
418 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
416 if (map) { 419 if (map) {
417 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 420 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
418 if (!present_section_nr(pnum)) 421 if (!present_section_nr(pnum))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ff3311447f5..915dceb487c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -73,10 +73,14 @@ struct scan_control {
73 73
74 int swappiness; 74 int swappiness;
75 75
76 int all_unreclaimable;
77
78 int order; 76 int order;
79 77
78 /*
 79 * Intend to reclaim enough contiguous memory rather than just enough
 80 * memory overall; i.e., it's the mode for high-order allocations.
81 */
82 bool lumpy_reclaim_mode;
83
80 /* Which cgroup do we reclaim from */ 84 /* Which cgroup do we reclaim from */
81 struct mem_cgroup *mem_cgroup; 85 struct mem_cgroup *mem_cgroup;
82 86
@@ -85,12 +89,6 @@ struct scan_control {
85 * are scanned. 89 * are scanned.
86 */ 90 */
87 nodemask_t *nodemask; 91 nodemask_t *nodemask;
88
89 /* Pluggable isolate pages callback */
90 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
91 unsigned long *scanned, int order, int mode,
92 struct zone *z, struct mem_cgroup *mem_cont,
93 int active, int file);
94}; 92};
95 93
96#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 94#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
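With the open-coded test removed from shrink_inactive_list() further down, the flag is presumably computed once per reclaim pass; a sketch that mirrors the threshold logic the deleted comment describes (high-order allocations, or any order once reclaim priority has dropped, switch to lumpy mode):

static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
{
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->lumpy_reclaim_mode = 1;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->lumpy_reclaim_mode = 1;
	else
		sc->lumpy_reclaim_mode = 0;
}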
@@ -575,7 +573,7 @@ static enum page_references page_check_references(struct page *page,
575 referenced_page = TestClearPageReferenced(page); 573 referenced_page = TestClearPageReferenced(page);
576 574
577 /* Lumpy reclaim - ignore references */ 575 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 576 if (sc->lumpy_reclaim_mode)
579 return PAGEREF_RECLAIM; 577 return PAGEREF_RECLAIM;
580 578
581 /* 579 /*
@@ -839,11 +837,6 @@ keep:
839 return nr_reclaimed; 837 return nr_reclaimed;
840} 838}
841 839
842/* LRU Isolation modes. */
843#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
844#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
845#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
846
847/* 840/*
848 * Attempt to remove the specified page from its LRU. Only take this page 841 * Attempt to remove the specified page from its LRU. Only take this page
849 * if it is of the appropriate PageActive status. Pages which are being 842 * if it is of the appropriate PageActive status. Pages which are being
@@ -1011,7 +1004,6 @@ static unsigned long isolate_pages_global(unsigned long nr,
1011 struct list_head *dst, 1004 struct list_head *dst,
1012 unsigned long *scanned, int order, 1005 unsigned long *scanned, int order,
1013 int mode, struct zone *z, 1006 int mode, struct zone *z,
1014 struct mem_cgroup *mem_cont,
1015 int active, int file) 1007 int active, int file)
1016{ 1008{
1017 int lru = LRU_BASE; 1009 int lru = LRU_BASE;
@@ -1130,7 +1122,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1130 unsigned long nr_scanned = 0; 1122 unsigned long nr_scanned = 0;
1131 unsigned long nr_reclaimed = 0; 1123 unsigned long nr_reclaimed = 0;
1132 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1124 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1133 int lumpy_reclaim = 0;
1134 1125
1135 while (unlikely(too_many_isolated(zone, file, sc))) { 1126 while (unlikely(too_many_isolated(zone, file, sc))) {
1136 congestion_wait(BLK_RW_ASYNC, HZ/10); 1127 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1140,17 +1131,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 return SWAP_CLUSTER_MAX; 1131 return SWAP_CLUSTER_MAX;
1141 } 1132 }
1142 1133
1143 /*
1144 * If we need a large contiguous chunk of memory, or have
1145 * trouble getting a small set of contiguous pages, we
1146 * will reclaim both active and inactive pages.
1147 *
1148 * We use the same threshold as pageout congestion_wait below.
1149 */
1150 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1151 lumpy_reclaim = 1;
1152 else if (sc->order && priority < DEF_PRIORITY - 2)
1153 lumpy_reclaim = 1;
1154 1134
1155 pagevec_init(&pvec, 1); 1135 pagevec_init(&pvec, 1);
1156 1136
@@ -1163,15 +1143,15 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1163 unsigned long nr_freed; 1143 unsigned long nr_freed;
1164 unsigned long nr_active; 1144 unsigned long nr_active;
1165 unsigned int count[NR_LRU_LISTS] = { 0, }; 1145 unsigned int count[NR_LRU_LISTS] = { 0, };
1166 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1146 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1167 unsigned long nr_anon; 1147 unsigned long nr_anon;
1168 unsigned long nr_file; 1148 unsigned long nr_file;
1169 1149
1170 nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
1171 &page_list, &nr_scan, sc->order, mode,
1172 zone, sc->mem_cgroup, 0, file);
1173
1174 if (scanning_global_lru(sc)) { 1150 if (scanning_global_lru(sc)) {
1151 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
1152 &page_list, &nr_scan,
1153 sc->order, mode,
1154 zone, 0, file);
1175 zone->pages_scanned += nr_scan; 1155 zone->pages_scanned += nr_scan;
1176 if (current_is_kswapd()) 1156 if (current_is_kswapd())
1177 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1157 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1179,6 +1159,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1179 else 1159 else
1180 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1160 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1181 nr_scan); 1161 nr_scan);
1162 } else {
1163 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1164 &page_list, &nr_scan,
1165 sc->order, mode,
1166 zone, sc->mem_cgroup,
1167 0, file);
1168 /*
1169 * mem_cgroup_isolate_pages() keeps track of
1170 * scanned pages on its own.
1171 */
1182 } 1172 }
1183 1173
1184 if (nr_taken == 0) 1174 if (nr_taken == 0)
@@ -1216,7 +1206,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1216 * but that should be acceptable to the caller 1206 * but that should be acceptable to the caller
1217 */ 1207 */
1218 if (nr_freed < nr_taken && !current_is_kswapd() && 1208 if (nr_freed < nr_taken && !current_is_kswapd() &&
1219 lumpy_reclaim) { 1209 sc->lumpy_reclaim_mode) {
1220 congestion_wait(BLK_RW_ASYNC, HZ/10); 1210 congestion_wait(BLK_RW_ASYNC, HZ/10);
1221 1211
1222 /* 1212 /*
@@ -1356,16 +1346,23 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1356 1346
1357 lru_add_drain(); 1347 lru_add_drain();
1358 spin_lock_irq(&zone->lru_lock); 1348 spin_lock_irq(&zone->lru_lock);
1359 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1360 ISOLATE_ACTIVE, zone,
1361 sc->mem_cgroup, 1, file);
1362 /*
1363 * zone->pages_scanned is used for detect zone's oom
1364 * mem_cgroup remembers nr_scan by itself.
1365 */
1366 if (scanning_global_lru(sc)) { 1349 if (scanning_global_lru(sc)) {
1350 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1351 &pgscanned, sc->order,
1352 ISOLATE_ACTIVE, zone,
1353 1, file);
1367 zone->pages_scanned += pgscanned; 1354 zone->pages_scanned += pgscanned;
1355 } else {
1356 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1357 &pgscanned, sc->order,
1358 ISOLATE_ACTIVE, zone,
1359 sc->mem_cgroup, 1, file);
1360 /*
1361 * mem_cgroup_isolate_pages() keeps track of
1362 * scanned pages on its own.
1363 */
1368 } 1364 }
1365
1369 reclaim_stat->recent_scanned[file] += nr_taken; 1366 reclaim_stat->recent_scanned[file] += nr_taken;
1370 1367
1371 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1368 __count_zone_vm_events(PGREFILL, zone, pgscanned);
@@ -1519,21 +1516,52 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1519} 1516}
1520 1517
1521/* 1518/*
1519 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1520 * until we collected @swap_cluster_max pages to scan.
1521 */
1522static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1523 unsigned long *nr_saved_scan)
1524{
1525 unsigned long nr;
1526
1527 *nr_saved_scan += nr_to_scan;
1528 nr = *nr_saved_scan;
1529
1530 if (nr >= SWAP_CLUSTER_MAX)
1531 *nr_saved_scan = 0;
1532 else
1533 nr = 0;
1534
1535 return nr;
1536}
1537
1538/*
1522 * Determine how aggressively the anon and file LRU lists should be 1539 * Determine how aggressively the anon and file LRU lists should be
1523 * scanned. The relative value of each set of LRU lists is determined 1540 * scanned. The relative value of each set of LRU lists is determined
1524 * by looking at the fraction of the pages scanned we did rotate back 1541 * by looking at the fraction of the pages scanned we did rotate back
1525 * onto the active list instead of evict. 1542 * onto the active list instead of evict.
1526 * 1543 *
1527 * percent[0] specifies how much pressure to put on ram/swap backed
1528 * memory, while percent[1] determines pressure on the file LRUs.
1544 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1529 */ 1545 */
1530static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1531 unsigned long *percent)
1546static void get_scan_count(struct zone *zone, struct scan_control *sc,
1547 unsigned long *nr, int priority)
1532{ 1548{
1533 unsigned long anon, file, free; 1549 unsigned long anon, file, free;
1534 unsigned long anon_prio, file_prio; 1550 unsigned long anon_prio, file_prio;
1535 unsigned long ap, fp; 1551 unsigned long ap, fp;
1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1552 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1553 u64 fraction[2], denominator;
1554 enum lru_list l;
1555 int noswap = 0;
1556
1557 /* If we have no swap space, do not bother scanning anon pages. */
1558 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1559 noswap = 1;
1560 fraction[0] = 0;
1561 fraction[1] = 1;
1562 denominator = 1;
1563 goto out;
1564 }
1537 1565
1538 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1566 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1539 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1567 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1545,9 +1573,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1545 /* If we have very few page cache pages, 1573 /* If we have very few page cache pages,
1546 force-scan anon pages. */ 1574 force-scan anon pages. */
1547 if (unlikely(file + free <= high_wmark_pages(zone))) { 1575 if (unlikely(file + free <= high_wmark_pages(zone))) {
1548 percent[0] = 100;
1549 percent[1] = 0;
1550 return;
1576 fraction[0] = 1;
1577 fraction[1] = 0;
1578 denominator = 1;
1579 goto out;
1551 } 1580 }
1552 } 1581 }
1553 1582
@@ -1594,29 +1623,37 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1594 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1623 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1595 fp /= reclaim_stat->recent_rotated[1] + 1; 1624 fp /= reclaim_stat->recent_rotated[1] + 1;
1596 1625
1597 /* Normalize to percentages */
1598 percent[0] = 100 * ap / (ap + fp + 1);
1599 percent[1] = 100 - percent[0];
1626 fraction[0] = ap;
1627 fraction[1] = fp;
1628 denominator = ap + fp + 1;
1629out:
1630 for_each_evictable_lru(l) {
1631 int file = is_file_lru(l);
1632 unsigned long scan;
1633
1634 scan = zone_nr_lru_pages(zone, sc, l);
1635 if (priority || noswap) {
1636 scan >>= priority;
1637 scan = div64_u64(scan * fraction[file], denominator);
1638 }
1639 nr[l] = nr_scan_try_batch(scan,
1640 &reclaim_stat->nr_saved_scan[l]);
1641 }
1600} 1642}
1601 1643
1602/*
1603 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1604 * until we collected @swap_cluster_max pages to scan.
1605 */
1606static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1607 unsigned long *nr_saved_scan)
1608{
1609 unsigned long nr;
1610
1611 *nr_saved_scan += nr_to_scan;
1612 nr = *nr_saved_scan;
1613
1614 if (nr >= SWAP_CLUSTER_MAX)
1615 *nr_saved_scan = 0;
1616 else
1617 nr = 0;
1618
1619 return nr;
1620}
1644static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1645{
1646 /*
1647 * If we need a large contiguous chunk of memory, or have
1648 * trouble getting a small set of contiguous pages, we
1649 * will reclaim both active and inactive pages.
1650 */
1651 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1652 sc->lumpy_reclaim_mode = 1;
1653 else if (sc->order && priority < DEF_PRIORITY - 2)
1654 sc->lumpy_reclaim_mode = 1;
1655 else
1656 sc->lumpy_reclaim_mode = 0;
1657}
1621 1658
1622/* 1659/*
@@ -1627,33 +1664,13 @@ static void shrink_zone(int priority, struct zone *zone,
1627{ 1664{
1628 unsigned long nr[NR_LRU_LISTS]; 1665 unsigned long nr[NR_LRU_LISTS];
1629 unsigned long nr_to_scan; 1666 unsigned long nr_to_scan;
1630 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1631 enum lru_list l; 1667 enum lru_list l;
1632 unsigned long nr_reclaimed = sc->nr_reclaimed; 1668 unsigned long nr_reclaimed = sc->nr_reclaimed;
1633 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1669 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1634 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1635 int noswap = 0;
1636
1637 /* If we have no swap space, do not bother scanning anon pages. */
1638 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1639 noswap = 1;
1640 percent[0] = 0;
1641 percent[1] = 100;
1642 } else
1643 get_scan_ratio(zone, sc, percent);
1644 1670
1645 for_each_evictable_lru(l) {
1646 int file = is_file_lru(l);
1647 unsigned long scan;
1648
1649 scan = zone_nr_lru_pages(zone, sc, l);
1650 if (priority || noswap) {
1651 scan >>= priority;
1652 scan = (scan * percent[file]) / 100;
1653 }
1654 nr[l] = nr_scan_try_batch(scan,
1655 &reclaim_stat->nr_saved_scan[l]);
1656 }
1671 get_scan_count(zone, sc, nr, priority);
1672
1673 set_lumpy_reclaim_mode(priority, sc);
1657 1674
1658 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1675 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1659 nr[LRU_INACTIVE_FILE]) { 1676 nr[LRU_INACTIVE_FILE]) {
@@ -1707,14 +1724,14 @@ static void shrink_zone(int priority, struct zone *zone,
1707 * If a zone is deemed to be full of pinned pages then just give it a light 1724 * If a zone is deemed to be full of pinned pages then just give it a light
1708 * scan then give up on it. 1725 * scan then give up on it.
1709 */ 1726 */
1710static void shrink_zones(int priority, struct zonelist *zonelist,
1727static int shrink_zones(int priority, struct zonelist *zonelist,
1711 struct scan_control *sc) 1728 struct scan_control *sc)
1712{ 1729{
1713 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1730 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1714 struct zoneref *z; 1731 struct zoneref *z;
1715 struct zone *zone; 1732 struct zone *zone;
1733 int progress = 0;
1716 1734
1717 sc->all_unreclaimable = 1;
1718 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1735 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1719 sc->nodemask) { 1736 sc->nodemask) {
1720 if (!populated_zone(zone)) 1737 if (!populated_zone(zone))
@@ -1730,19 +1747,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1730 1747
1731 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1748 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1732 continue; /* Let kswapd poll it */ 1749 continue; /* Let kswapd poll it */
1733 sc->all_unreclaimable = 0;
1734 } else { 1750 } else {
1735 /* 1751 /*
1736 * Ignore cpuset limitation here. We just want to reduce 1752 * Ignore cpuset limitation here. We just want to reduce
1737 * # of used pages by us regardless of memory shortage. 1753 * # of used pages by us regardless of memory shortage.
1738 */ 1754 */
1739 sc->all_unreclaimable = 0;
1740 mem_cgroup_note_reclaim_priority(sc->mem_cgroup, 1755 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1741 priority); 1756 priority);
1742 } 1757 }
1743 1758
1744 shrink_zone(priority, zone, sc); 1759 shrink_zone(priority, zone, sc);
1760 progress = 1;
1745 } 1761 }
1762 return progress;
1746} 1763}
1747 1764
1748/* 1765/*
@@ -1774,6 +1791,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1774 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1791 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1775 unsigned long writeback_threshold; 1792 unsigned long writeback_threshold;
1776 1793
1794 get_mems_allowed();
1777 delayacct_freepages_start(); 1795 delayacct_freepages_start();
1778 1796
1779 if (scanning_global_lru(sc)) 1797 if (scanning_global_lru(sc))
@@ -1795,7 +1813,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1795 sc->nr_scanned = 0; 1813 sc->nr_scanned = 0;
1796 if (!priority) 1814 if (!priority)
1797 disable_swap_token(); 1815 disable_swap_token();
1798 shrink_zones(priority, zonelist, sc);
1816 ret = shrink_zones(priority, zonelist, sc);
1799 /* 1817 /*
1800 * Don't shrink slabs when reclaiming memory from 1818 * Don't shrink slabs when reclaiming memory from
1801 * over limit cgroups 1819 * over limit cgroups
@@ -1832,7 +1850,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1832 congestion_wait(BLK_RW_ASYNC, HZ/10); 1850 congestion_wait(BLK_RW_ASYNC, HZ/10);
1833 } 1851 }
1834 /* top priority shrink_zones still had more to do? don't OOM, then */ 1852 /* top priority shrink_zones still had more to do? don't OOM, then */
1835 if (!sc->all_unreclaimable && scanning_global_lru(sc))
1853 if (ret && scanning_global_lru(sc))
1836 ret = sc->nr_reclaimed; 1854 ret = sc->nr_reclaimed;
1837out: 1855out:
1838 /* 1856 /*
@@ -1857,6 +1875,7 @@ out:
1857 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1875 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1858 1876
1859 delayacct_freepages_end(); 1877 delayacct_freepages_end();
1878 put_mems_allowed();
1860 1879
1861 return ret; 1880 return ret;
1862} 1881}
@@ -1873,7 +1892,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1873 .swappiness = vm_swappiness, 1892 .swappiness = vm_swappiness,
1874 .order = order, 1893 .order = order,
1875 .mem_cgroup = NULL, 1894 .mem_cgroup = NULL,
1876 .isolate_pages = isolate_pages_global,
1877 .nodemask = nodemask, 1895 .nodemask = nodemask,
1878 }; 1896 };
1879 1897
@@ -1894,7 +1912,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1894 .swappiness = swappiness, 1912 .swappiness = swappiness,
1895 .order = 0, 1913 .order = 0,
1896 .mem_cgroup = mem, 1914 .mem_cgroup = mem,
1897 .isolate_pages = mem_cgroup_isolate_pages,
1898 }; 1915 };
1899 nodemask_t nm = nodemask_of_node(nid); 1916 nodemask_t nm = nodemask_of_node(nid);
1900 1917
@@ -1928,7 +1945,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1928 .swappiness = swappiness, 1945 .swappiness = swappiness,
1929 .order = 0, 1946 .order = 0,
1930 .mem_cgroup = mem_cont, 1947 .mem_cgroup = mem_cont,
1931 .isolate_pages = mem_cgroup_isolate_pages,
1932 .nodemask = NULL, /* we don't care the placement */ 1948 .nodemask = NULL, /* we don't care the placement */
1933 }; 1949 };
1934 1950
@@ -2006,7 +2022,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2006 .swappiness = vm_swappiness, 2022 .swappiness = vm_swappiness,
2007 .order = order, 2023 .order = order,
2008 .mem_cgroup = NULL, 2024 .mem_cgroup = NULL,
2009 .isolate_pages = isolate_pages_global,
2010 }; 2025 };
2011 /* 2026 /*
2012 * temp_priority is used to remember the scanning priority at which 2027 * temp_priority is used to remember the scanning priority at which
@@ -2385,7 +2400,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2385 .hibernation_mode = 1, 2400 .hibernation_mode = 1,
2386 .swappiness = vm_swappiness, 2401 .swappiness = vm_swappiness,
2387 .order = 0, 2402 .order = 0,
2388 .isolate_pages = isolate_pages_global,
2389 }; 2403 };
2390 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2404 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2391 struct task_struct *p = current; 2405 struct task_struct *p = current;
@@ -2570,7 +2584,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2570 .gfp_mask = gfp_mask, 2584 .gfp_mask = gfp_mask,
2571 .swappiness = vm_swappiness, 2585 .swappiness = vm_swappiness,
2572 .order = order, 2586 .order = order,
2573 .isolate_pages = isolate_pages_global,
2574 }; 2587 };
2575 unsigned long slab_reclaimable; 2588 unsigned long slab_reclaimable;
2576 2589
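
Taken together, the mm/vmscan.c hunks above replace get_scan_ratio()'s percentages with get_scan_count()'s absolute per-LRU scan targets, move the nr_scan_try_batch() batching along with it, and centralise the lumpy-reclaim decision in set_lumpy_reclaim_mode() on the scan_control. A minimal userspace sketch of the arithmetic, assuming SWAP_CLUSTER_MAX is 32 (its usual value) and a made-up anon/file fraction:

/*
 * Illustrative userspace sketch of the reworked scan accounting, not
 * kernel code: each per-LRU count is scaled by the reclaim priority,
 * split by the anon/file fraction, then fed through nr_scan_try_batch()
 * so tiny requests accumulate until a full batch exists.
 */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;	/* release the accumulated batch */
	else
		nr = 0;			/* too small: defer, keep the carry */

	return nr;
}

int main(void)
{
	unsigned long lru_pages = 4096;			/* pages on one LRU */
	unsigned long fraction = 3, denominator = 4;	/* made-up ratio */
	unsigned long saved = 0;
	int priority = 8;

	/* per-round target, as computed at the end of get_scan_count() */
	unsigned long scan = (lru_pages >> priority) * fraction / denominator;

	printf("per-round target: %lu pages\n", scan);	/* 4096>>8=16; 16*3/4=12 */

	for (int round = 1; round <= 3; round++)
		printf("round %d: scan %lu\n", round,
		       nr_scan_try_batch(scan, &saved));
	return 0;
}

At priority 8 the 4096-page list yields a 12-page target per round; rounds one and two are deferred and round three scans the accumulated 36 pages, so reclaim always works in at-least-SWAP_CLUSTER_MAX batches.
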
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fa12ea3051fb..7759941d4e77 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h>
19 20
20#ifdef CONFIG_VM_EVENT_COUNTERS 21#ifdef CONFIG_VM_EVENT_COUNTERS
21DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -379,7 +380,86 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
379} 380}
380#endif 381#endif
381 382
382#ifdef CONFIG_PROC_FS
383#ifdef CONFIG_COMPACTION
384struct contig_page_info {
385 unsigned long free_pages;
386 unsigned long free_blocks_total;
387 unsigned long free_blocks_suitable;
388};
389
390/*
391 * Calculate the number of free pages in a zone, how many contiguous
392 * pages are free and how many are large enough to satisfy an allocation of
393 * the target size. Note that this function makes no attempt to estimate
394 * how many suitable free blocks there *might* be if MOVABLE pages were
395 * migrated. Calculating that is possible, but expensive and can be
396 * figured out from userspace
397 */
398static void fill_contig_page_info(struct zone *zone,
399 unsigned int suitable_order,
400 struct contig_page_info *info)
401{
402 unsigned int order;
403
404 info->free_pages = 0;
405 info->free_blocks_total = 0;
406 info->free_blocks_suitable = 0;
407
408 for (order = 0; order < MAX_ORDER; order++) {
409 unsigned long blocks;
410
411 /* Count number of free blocks */
412 blocks = zone->free_area[order].nr_free;
413 info->free_blocks_total += blocks;
414
415 /* Count free base pages */
416 info->free_pages += blocks << order;
417
418 /* Count the suitable free blocks */
419 if (order >= suitable_order)
420 info->free_blocks_suitable += blocks <<
421 (order - suitable_order);
422 }
423}
424
425/*
426 * A fragmentation index only makes sense if an allocation of a requested
427 * size would fail. If that is true, the fragmentation index indicates
428 * whether external fragmentation or a lack of memory was the problem.
429 * The value can be used to determine if page reclaim or compaction
430 * should be used
431 */
432static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
433{
434 unsigned long requested = 1UL << order;
435
436 if (!info->free_blocks_total)
437 return 0;
438
439 /* Fragmentation index only makes sense when a request would fail */
440 if (info->free_blocks_suitable)
441 return -1000;
442
443 /*
444 * Index is between 0 and 1 so return within 3 decimal places
445 *
446 * 0 => allocation would fail due to lack of memory
447 * 1 => allocation would fail due to fragmentation
448 */
449 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
450}
451
452/* Same as __fragmentation index but allocs contig_page_info on stack */
453int fragmentation_index(struct zone *zone, unsigned int order)
454{
455 struct contig_page_info info;
456
457 fill_contig_page_info(zone, order, &info);
458 return __fragmentation_index(order, &info);
459}
460#endif
461
462#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
383#include <linux/proc_fs.h> 463#include <linux/proc_fs.h>
384#include <linux/seq_file.h> 464#include <linux/seq_file.h>
385 465
@@ -432,7 +512,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
432 spin_unlock_irqrestore(&zone->lock, flags); 512 spin_unlock_irqrestore(&zone->lock, flags);
433 } 513 }
434} 514}
515#endif
435 516
517#ifdef CONFIG_PROC_FS
436static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 518static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
437 struct zone *zone) 519 struct zone *zone)
438{ 520{
@@ -693,6 +775,16 @@ static const char * const vmstat_text[] = {
693 "allocstall", 775 "allocstall",
694 776
695 "pgrotated", 777 "pgrotated",
778
779#ifdef CONFIG_COMPACTION
780 "compact_blocks_moved",
781 "compact_pages_moved",
782 "compact_pagemigrate_failed",
783 "compact_stall",
784 "compact_fail",
785 "compact_success",
786#endif
787
696#ifdef CONFIG_HUGETLB_PAGE 788#ifdef CONFIG_HUGETLB_PAGE
697 "htlb_buddy_alloc_success", 789 "htlb_buddy_alloc_success",
698 "htlb_buddy_alloc_fail", 790 "htlb_buddy_alloc_fail",
@@ -954,3 +1046,162 @@ static int __init setup_vmstat(void)
954 return 0; 1046 return 0;
955} 1047}
956module_init(setup_vmstat) 1048module_init(setup_vmstat)
1049
1050#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1051#include <linux/debugfs.h>
1052
1053static struct dentry *extfrag_debug_root;
1054
1055/*
1056 * Return an index indicating how much of the available free memory is
1057 * unusable for an allocation of the requested size.
1058 */
1059static int unusable_free_index(unsigned int order,
1060 struct contig_page_info *info)
1061{
1062 /* No free memory is interpreted as all free memory is unusable */
1063 if (info->free_pages == 0)
1064 return 1000;
1065
1066 /*
1067 * Index should be a value between 0 and 1. Return a value to 3
1068 * decimal places.
1069 *
1070 * 0 => no fragmentation
1071 * 1 => high fragmentation
1072 */
1073 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1074
1075}
1076
1077static void unusable_show_print(struct seq_file *m,
1078 pg_data_t *pgdat, struct zone *zone)
1079{
1080 unsigned int order;
1081 int index;
1082 struct contig_page_info info;
1083
1084 seq_printf(m, "Node %d, zone %8s ",
1085 pgdat->node_id,
1086 zone->name);
1087 for (order = 0; order < MAX_ORDER; ++order) {
1088 fill_contig_page_info(zone, order, &info);
1089 index = unusable_free_index(order, &info);
1090 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1091 }
1092
1093 seq_putc(m, '\n');
1094}
1095
1096/*
1097 * Display unusable free space index
1098 *
1099 * The unusable free space index measures how much of the available free
1100 * memory cannot be used to satisfy an allocation of a given size and is a
1101 * value between 0 and 1. The higher the value, the more of free memory is
1102 * unusable and by implication, the worse the external fragmentation is. This
1103 * can be expressed as a percentage by multiplying by 100.
1104 */
1105static int unusable_show(struct seq_file *m, void *arg)
1106{
1107 pg_data_t *pgdat = (pg_data_t *)arg;
1108
1109 /* check memoryless node */
1110 if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
1111 return 0;
1112
1113 walk_zones_in_node(m, pgdat, unusable_show_print);
1114
1115 return 0;
1116}
1117
1118static const struct seq_operations unusable_op = {
1119 .start = frag_start,
1120 .next = frag_next,
1121 .stop = frag_stop,
1122 .show = unusable_show,
1123};
1124
1125static int unusable_open(struct inode *inode, struct file *file)
1126{
1127 return seq_open(file, &unusable_op);
1128}
1129
1130static const struct file_operations unusable_file_ops = {
1131 .open = unusable_open,
1132 .read = seq_read,
1133 .llseek = seq_lseek,
1134 .release = seq_release,
1135};
1136
1137static void extfrag_show_print(struct seq_file *m,
1138 pg_data_t *pgdat, struct zone *zone)
1139{
1140 unsigned int order;
1141 int index;
1142
1143 /* Alloc on stack as interrupts are disabled for zone walk */
1144 struct contig_page_info info;
1145
1146 seq_printf(m, "Node %d, zone %8s ",
1147 pgdat->node_id,
1148 zone->name);
1149 for (order = 0; order < MAX_ORDER; ++order) {
1150 fill_contig_page_info(zone, order, &info);
1151 index = __fragmentation_index(order, &info);
1152 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1153 }
1154
1155 seq_putc(m, '\n');
1156}
1157
1158/*
1159 * Display fragmentation index for orders that allocations would fail for
1160 */
1161static int extfrag_show(struct seq_file *m, void *arg)
1162{
1163 pg_data_t *pgdat = (pg_data_t *)arg;
1164
1165 walk_zones_in_node(m, pgdat, extfrag_show_print);
1166
1167 return 0;
1168}
1169
1170static const struct seq_operations extfrag_op = {
1171 .start = frag_start,
1172 .next = frag_next,
1173 .stop = frag_stop,
1174 .show = extfrag_show,
1175};
1176
1177static int extfrag_open(struct inode *inode, struct file *file)
1178{
1179 return seq_open(file, &extfrag_op);
1180}
1181
1182static const struct file_operations extfrag_file_ops = {
1183 .open = extfrag_open,
1184 .read = seq_read,
1185 .llseek = seq_lseek,
1186 .release = seq_release,
1187};
1188
1189static int __init extfrag_debug_init(void)
1190{
1191 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1192 if (!extfrag_debug_root)
1193 return -ENOMEM;
1194
1195 if (!debugfs_create_file("unusable_index", 0444,
1196 extfrag_debug_root, NULL, &unusable_file_ops))
1197 return -ENOMEM;
1198
1199 if (!debugfs_create_file("extfrag_index", 0444,
1200 extfrag_debug_root, NULL, &extfrag_file_ops))
1201 return -ENOMEM;
1202
1203 return 0;
1204}
1205
1206module_init(extfrag_debug_init);
1207#endif
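
Both indices added above are fixed-point values with three implied decimal places. fragmentation_index() is only meaningful when an allocation of the requested order would fail: -1000 means a suitable block exists, values near 0 blame a lack of memory, values near 1000 blame external fragmentation. unusable_free_index() instead reports the share of free pages that cannot serve the request. A userspace sketch with made-up zone numbers, using plain division where the kernel uses div_u64():

/*
 * Illustrative userspace sketch of the two indices, computed for a
 * hypothetical zone; not the kernel's implementation.
 */
#include <stdio.h>

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

static int fragmentation_index(unsigned int order,
			       struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;
	if (info->free_blocks_suitable)
		return -1000;	/* a suitable block exists: request succeeds */

	/* 0 => failing for lack of memory, 1000 => failing for fragmentation */
	return 1000 - (1000 + info->free_pages * 1000 / requested)
			/ info->free_blocks_total;
}

static int unusable_free_index(unsigned int order,
			       struct contig_page_info *info)
{
	if (info->free_pages == 0)
		return 1000;	/* no free memory: all of it is "unusable" */

	return (info->free_pages - (info->free_blocks_suitable << order))
			* 1000 / info->free_pages;
}

int main(void)
{
	/* 1000 free pages in 500 order-1 blocks; nothing at order 4 */
	struct contig_page_info info = {
		.free_pages = 1000,
		.free_blocks_total = 500,
		.free_blocks_suitable = 0,
	};

	printf("extfrag index (order 4):  %d\n", fragmentation_index(4, &info));
	printf("unusable index (order 4): %d\n", unusable_free_index(4, &info));
	return 0;
}

Here 1000 free pages sit in 500 order-1 blocks: the fragmentation index is 1000 - (1000 + 62500)/500 = 873 and the unusable index is 1000, i.e. memory is plentiful but completely fragmented at order 4, so compaction rather than reclaim is indicated.
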
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 77d3aab4036b..149f82160130 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -394,7 +394,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
394 const char *sptr = va_arg(ap, const char *); 394 const char *sptr = va_arg(ap, const char *);
395 int16_t len = 0; 395 int16_t len = 0;
396 if (sptr) 396 if (sptr)
397 len = MIN(strlen(sptr), USHORT_MAX);
397 len = MIN(strlen(sptr), USHRT_MAX);
398 398
399 errcode = p9pdu_writef(pdu, proto_version, 399 errcode = p9pdu_writef(pdu, proto_version,
400 "w", len); 400 "w", len);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 1b08cae9c65b..07395f861d35 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -296,7 +296,7 @@ static inline u8 dccp_ndp_len(const u64 ndp)
296{ 296{
297 if (likely(ndp <= 0xFF)) 297 if (likely(ndp <= 0xFF))
298 return 1; 298 return 1;
299 return likely(ndp <= USHORT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
299 return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
300} 300}
301 301
302int dccp_insert_option(struct sock *sk, struct sk_buff *skb, 302int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
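
dccp_ndp_len() chooses the smallest option length able to carry the NDP count, and after the rename its thresholds read as the natural limits of 1-, 2-, 4- and 6-byte unsigned values. A userspace sketch with sample counts:

/*
 * Userspace sketch of dccp_ndp_len(), minus the kernel's likely()
 * annotations; sample values are made up.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t ndp_len(uint64_t ndp)
{
	if (ndp <= 0xFF)
		return 1;
	return ndp <= USHRT_MAX ? 2 : (ndp <= UINT_MAX ? 4 : 6);
}

int main(void)
{
	uint64_t samples[] = { 200, 40000, 3000000000ULL, 5000000000ULL };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("ndp=%llu -> %u byte(s)\n",
		       (unsigned long long)samples[i], ndp_len(samples[i]));
	return 0;
}
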
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9de6a698f91d..baeec29fe0f1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1686,8 +1686,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1686 return -ENOPROTOOPT; 1686 return -ENOPROTOOPT;
1687 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ 1687 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
1688 val = 8; 1688 val = 8;
1689 else if (val > USHORT_MAX)
1690 val = USHORT_MAX;
1689 else if (val > USHRT_MAX)
1690 val = USHRT_MAX;
1691 up->pcslen = val; 1691 up->pcslen = val;
1692 up->pcflag |= UDPLITE_SEND_CC; 1692 up->pcflag |= UDPLITE_SEND_CC;
1693 break; 1693 break;
@@ -1700,8 +1700,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1700 return -ENOPROTOOPT; 1700 return -ENOPROTOOPT;
1701 if (val != 0 && val < 8) /* Avoid silly minimal values. */ 1701 if (val != 0 && val < 8) /* Avoid silly minimal values. */
1702 val = 8; 1702 val = 8;
1703 else if (val > USHORT_MAX)
1704 val = USHORT_MAX;
1703 else if (val > USHRT_MAX)
1704 val = USHRT_MAX;
1705 up->pcrlen = val; 1705 up->pcrlen = val;
1706 up->pcflag |= UDPLITE_RECV_CC; 1706 up->pcflag |= UDPLITE_RECV_CC;
1707 break; 1707 break;
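
UDP-Lite checksum coverage travels in a 16-bit wire field, so udp_lib_setsockopt() normalises the user-supplied value: zero (full coverage) passes through, nonzero values smaller than the 8-byte header are raised to 8, and anything above USHRT_MAX is clamped. A userspace sketch of that rule:

/*
 * Userspace sketch of the UDP-Lite coverage normalisation above,
 * not kernel code.
 */
#include <limits.h>
#include <stdio.h>

static int clamp_udplite_coverage(int val)
{
	if (val != 0 && val < 8)	/* cover at least the UDP-Lite header */
		val = 8;
	else if (val > USHRT_MAX)	/* must fit the 16-bit coverage field */
		val = USHRT_MAX;
	return val;
}

int main(void)
{
	int samples[] = { 0, 3, 1400, 100000 };

	for (int i = 0; i < 4; i++)
		printf("%6d -> %5d\n", samples[i],
		       clamp_udplite_coverage(samples[i]));
	return 0;
}
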
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 730197591ab5..ba9360a475b0 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -259,7 +259,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
259 skb_queue_head_init(&sta->tx_filtered); 259 skb_queue_head_init(&sta->tx_filtered);
260 260
261 for (i = 0; i < NUM_RX_DATA_QUEUES; i++) 261 for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
262 sta->last_seq_ctrl[i] = cpu_to_le16(USHORT_MAX);
262 sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
263 263
264#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 264#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
265 printk(KERN_DEBUG "%s: Allocated STA %pM\n", 265 printk(KERN_DEBUG "%s: Allocated STA %pM\n",
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 121105355f60..dac219a56ae1 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -783,7 +783,7 @@ static int rpcb_dec_getport(struct rpc_rqst *req, __be32 *p,
783 port = ntohl(*p); 783 port = ntohl(*p);
784 dprintk("RPC: %5u PMAP_%s result: %lu\n", task->tk_pid, 784 dprintk("RPC: %5u PMAP_%s result: %lu\n", task->tk_pid,
785 task->tk_msg.rpc_proc->p_name, port); 785 task->tk_msg.rpc_proc->p_name, port);
786 if (unlikely(port > USHORT_MAX))
786 if (unlikely(port > USHRT_MAX))
787 return -EIO; 787 return -EIO;
788 788
789 rpcb->r_port = port; 789 rpcb->r_port = port;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3fc325399ee4..dcd0132396ba 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -166,7 +166,6 @@ EXPORT_SYMBOL_GPL(xprt_unregister_transport);
166int xprt_load_transport(const char *transport_name) 166int xprt_load_transport(const char *transport_name)
167{ 167{
168 struct xprt_class *t; 168 struct xprt_class *t;
169 char module_name[sizeof t->name + 5];
170 int result; 169 int result;
171 170
172 result = 0; 171 result = 0;
@@ -178,9 +177,7 @@ int xprt_load_transport(const char *transport_name)
178 } 177 }
179 } 178 }
180 spin_unlock(&xprt_list_lock); 179 spin_unlock(&xprt_list_lock);
181 strcpy(module_name, "xprt");
182 strncat(module_name, transport_name, sizeof t->name);
183 result = request_module(module_name);
180 result = request_module("xprt%s", transport_name);
184out: 181out:
185 return result; 182 return result;
186} 183}
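
The xprt.c hunk drops a hand-assembled module-name buffer: the old strcpy()/strncat() dance against sizeof t->name was easy to size wrongly (strncat() writes up to n bytes plus a terminating NUL), whereas request_module() accepts printf-style arguments and formats "xprt%s" internally. A userspace model of the new shape; the 64-byte bound and the printf() are hypothetical stand-ins:

/*
 * Userspace model only; in the kernel, request_module() formats the
 * name into its own buffer, so the caller manages no sizing at all.
 */
#include <stdio.h>

static int load_xprt_module(const char *transport_name)
{
	char module_name[64];	/* hypothetical bound */

	snprintf(module_name, sizeof(module_name), "xprt%s", transport_name);
	printf("would load: %s\n", module_name);
	return 0;
}

int main(void)
{
	return load_xprt_module("rdma");
}
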
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index f2bbea900700..bd88f11b0953 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1382,6 +1382,21 @@ sub process {
1382 ERROR("trailing whitespace\n" . $herevet); 1382 ERROR("trailing whitespace\n" . $herevet);
1383 } 1383 }
1384 1384
1385# check for Kconfig help text having a real description
1386 if ($realfile =~ /Kconfig/ &&
1387 $line =~ /\+?\s*(---)?help(---)?$/) {
1388 my $length = 0;
1389 for (my $l = $linenr; defined($lines[$l]); $l++) {
1390 my $f = $lines[$l];
1391 $f =~ s/#.*//;
1392 $f =~ s/^\s+//;
1393 next if ($f =~ /^$/);
1394 last if ($f =~ /^\s*config\s/);
1395 $length++;
1396 }
1397 WARN("please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($length < 4);
1398 }
1399
1385# check we are in a valid source file if not then ignore this hunk 1400# check we are in a valid source file if not then ignore this hunk
1386 next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/); 1401 next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/);
1387 1402
@@ -2586,6 +2601,11 @@ sub process {
2586 CHK("architecture specific defines should be avoided\n" . $herecurr); 2601 CHK("architecture specific defines should be avoided\n" . $herecurr);
2587 } 2602 }
2588 2603
2604# Check that the storage class is at the beginning of a declaration
2605 if ($line =~ /\b$Storage\b/ && $line !~ /^.\s*$Storage\b/) {
2606 WARN("storage class should be at the beginning of the declaration\n" . $herecurr)
2607 }
2608
2589# check the location of the inline attribute, that it is between 2609# check the location of the inline attribute, that it is between
2590# storage class and type. 2610# storage class and type.
2591 if ($line =~ /\b$Type\s+$Inline\b/ || 2611 if ($line =~ /\b$Type\s+$Inline\b/ ||
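
Two checks are added to checkpatch above: a warning when a Kconfig help section contains fewer than four non-blank description lines, and a warning when a storage-class specifier is not the first token of a declaration. Illustrative C declarations for the latter:

/* Illustrative declarations only: what the new storage-class check
 * warns about versus the preferred ordering. */

int static bad_counter;				/* flagged */
inline static void bad_helper(void) { }		/* flagged */

static int good_counter;			/* preferred */
static inline void good_helper(void) { }	/* preferred */
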
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 6f97a13bcee4..b2281982f52f 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -13,7 +13,7 @@
13use strict; 13use strict;
14 14
15my $P = $0; 15my $P = $0;
16my $V = '0.23';
16my $V = '0.24';
17 17
18use Getopt::Long qw(:config no_auto_abbrev); 18use Getopt::Long qw(:config no_auto_abbrev);
19 19
@@ -25,6 +25,7 @@ my $email_list = 1;
25my $email_subscriber_list = 0; 25my $email_subscriber_list = 0;
26my $email_git_penguin_chiefs = 0; 26my $email_git_penguin_chiefs = 0;
27my $email_git = 1; 27my $email_git = 1;
28my $email_git_all_signature_types = 0;
28my $email_git_blame = 0; 29my $email_git_blame = 0;
29my $email_git_min_signatures = 1; 30my $email_git_min_signatures = 1;
30my $email_git_max_maintainers = 5; 31my $email_git_max_maintainers = 5;
@@ -51,9 +52,9 @@ my $help = 0;
51my $exit = 0; 52my $exit = 0;
52 53
53my @penguin_chief = (); 54my @penguin_chief = ();
54push(@penguin_chief,"Linus Torvalds:torvalds\@linux-foundation.org");
55push(@penguin_chief, "Linus Torvalds:torvalds\@linux-foundation.org");
55#Andrew wants in on most everything - 2009/01/14 56#Andrew wants in on most everything - 2009/01/14
56#push(@penguin_chief,"Andrew Morton:akpm\@linux-foundation.org");
57#push(@penguin_chief, "Andrew Morton:akpm\@linux-foundation.org");
57 58
58my @penguin_chief_names = (); 59my @penguin_chief_names = ();
59foreach my $chief (@penguin_chief) { 60foreach my $chief (@penguin_chief) {
@@ -63,7 +64,16 @@ foreach my $chief (@penguin_chief) {
63 push(@penguin_chief_names, $chief_name); 64 push(@penguin_chief_names, $chief_name);
64 } 65 }
65} 66}
66my $penguin_chiefs = "\(" . join("|",@penguin_chief_names) . "\)";
67my $penguin_chiefs = "\(" . join("|", @penguin_chief_names) . "\)";
68
69# Signature types of people who are either
70# a) responsible for the code in question, or
71# b) familiar enough with it to give relevant feedback
72my @signature_tags = ();
73push(@signature_tags, "Signed-off-by:");
74push(@signature_tags, "Reviewed-by:");
75push(@signature_tags, "Acked-by:");
76my $signaturePattern = "\(" . join("|", @signature_tags) . "\)";
67 77
68# rfc822 email address - preloaded methods go here. 78# rfc822 email address - preloaded methods go here.
69my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])"; 79my $rfc822_lwsp = "(?:(?:\\r\\n)?[ \\t])";
@@ -97,9 +107,34 @@ my %VCS_cmds_hg = (
97 "blame_commit_pattern" => "^([0-9a-f]+):" 107 "blame_commit_pattern" => "^([0-9a-f]+):"
98); 108);
99 109
110if (-f "${lk_path}.get_maintainer.conf") {
111 my @conf_args;
112 open(my $conffile, '<', "${lk_path}.get_maintainer.conf")
113 or warn "$P: Can't open .get_maintainer.conf: $!\n";
114 while (<$conffile>) {
115 my $line = $_;
116
117 $line =~ s/\s*\n?$//g;
118 $line =~ s/^\s*//g;
119 $line =~ s/\s+/ /g;
120
121 next if ($line =~ m/^\s*#/);
122 next if ($line =~ m/^\s*$/);
123
124 my @words = split(" ", $line);
125 foreach my $word (@words) {
126 last if ($word =~ m/^#/);
127 push (@conf_args, $word);
128 }
129 }
130 close($conffile);
131 unshift(@ARGV, @conf_args) if @conf_args;
132}
133
100if (!GetOptions( 134if (!GetOptions(
101 'email!' => \$email, 135 'email!' => \$email,
102 'git!' => \$email_git, 136 'git!' => \$email_git,
137 'git-all-signature-types!' => \$email_git_all_signature_types,
103 'git-blame!' => \$email_git_blame, 138 'git-blame!' => \$email_git_blame,
104 'git-chief-penguins!' => \$email_git_penguin_chiefs, 139 'git-chief-penguins!' => \$email_git_penguin_chiefs,
105 'git-min-signatures=i' => \$email_git_min_signatures, 140 'git-min-signatures=i' => \$email_git_min_signatures,
@@ -180,6 +215,10 @@ if (!top_of_kernel_tree($lk_path)) {
180 . "a linux kernel source tree.\n"; 215 . "a linux kernel source tree.\n";
181} 216}
182 217
218if ($email_git_all_signature_types) {
219 $signaturePattern = "(.+?)[Bb][Yy]:";
220}
221
183## Read MAINTAINERS for type/value pairs 222## Read MAINTAINERS for type/value pairs
184 223
185my @typevalue = (); 224my @typevalue = ();
@@ -497,13 +536,15 @@ version: $V
497MAINTAINER field selection options: 536MAINTAINER field selection options:
498 --email => print email address(es) if any 537 --email => print email address(es) if any
499 --git => include recent git \*-by: signers 538 --git => include recent git \*-by: signers
539 --git-all-signature-types => include signers regardless of signature type
540 or use only ${signaturePattern} signers (default: $email_git_all_signature_types)
500 --git-chief-penguins => include ${penguin_chiefs} 541 --git-chief-penguins => include ${penguin_chiefs}
501 --git-min-signatures => number of signatures required (default: 1)
502 --git-max-maintainers => maximum maintainers to add (default: 5)
503 --git-min-percent => minimum percentage of commits required (default: 5)
542 --git-min-signatures => number of signatures required (default: $email_git_min_signatures)
543 --git-max-maintainers => maximum maintainers to add (default: $email_git_max_maintainers)
544 --git-min-percent => minimum percentage of commits required (default: $email_git_min_percent)
504 --git-blame => use git blame to find modified commits for patch or file 545 --git-blame => use git blame to find modified commits for patch or file
505 --git-since => git history to use (default: 1-year-ago)
506 --hg-since => hg history to use (default: -365)
546 --git-since => git history to use (default: $email_git_since)
547 --hg-since => hg history to use (default: $email_hg_since)
507 --m => include maintainer(s) if any 548 --m => include maintainer(s) if any
508 --n => include name 'Full Name <addr\@domain.tld>' 549 --n => include name 'Full Name <addr\@domain.tld>'
509 --l => include list(s) if any 550 --l => include list(s) if any
@@ -556,6 +597,11 @@ Notes:
556 --git-min-signatures, --git-max-maintainers, --git-min-percent, and 597 --git-min-signatures, --git-max-maintainers, --git-min-percent, and
557 --git-blame 598 --git-blame
558 Use --hg-since not --git-since to control date selection 599 Use --hg-since not --git-since to control date selection
600 File ".get_maintainer.conf", if it exists in the linux kernel source root
601 directory, can change whatever get_maintainer defaults are desired.
602 Entries in this file can be any command line argument.
603 This file is prepended to any additional command line arguments.
604 Multiple lines and # comments are allowed.
559EOT 605EOT
560} 606}
561 607
@@ -964,7 +1010,7 @@ sub vcs_find_signers {
964 1010
965 $commits = grep(/$pattern/, @lines); # of commits 1011 $commits = grep(/$pattern/, @lines); # of commits
966 1012
967 @lines = grep(/^[-_ a-z]+by:.*\@.*$/i, @lines);
1013 @lines = grep(/^[ \t]*${signaturePattern}.*\@.*$/, @lines);
968 if (!$email_git_penguin_chiefs) { 1014 if (!$email_git_penguin_chiefs) {
969 @lines = grep(!/${penguin_chiefs}/i, @lines); 1015 @lines = grep(!/${penguin_chiefs}/i, @lines);
970 } 1016 }
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index ef03a82a0135..d37f713e73ce 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -669,7 +669,7 @@ static void keyring_unlink_rcu_disposal(struct rcu_head *rcu)
669 struct keyring_list *klist = 669 struct keyring_list *klist =
670 container_of(rcu, struct keyring_list, rcu); 670 container_of(rcu, struct keyring_list, rcu);
671 671
672 if (klist->delkey != USHORT_MAX)
672 if (klist->delkey != USHRT_MAX)
673 key_put(klist->keys[klist->delkey]); 673 key_put(klist->keys[klist->delkey]);
674 kfree(klist); 674 kfree(klist);
675} 675}
@@ -746,7 +746,7 @@ int __key_link_begin(struct key *keyring, const struct key_type *type,
746 max += klist->maxkeys; 746 max += klist->maxkeys;
747 747
748 ret = -ENFILE; 748 ret = -ENFILE;
749 if (max > USHORT_MAX - 1)
749 if (max > USHRT_MAX - 1)
750 goto error_quota; 750 goto error_quota;
751 size = sizeof(*klist) + sizeof(struct key *) * max; 751 size = sizeof(*klist) + sizeof(struct key *) * max;
752 if (size > PAGE_SIZE) 752 if (size > PAGE_SIZE)
@@ -763,7 +763,7 @@ int __key_link_begin(struct key *keyring, const struct key_type *type,
763 sizeof(struct key *) * klist->nkeys); 763 sizeof(struct key *) * klist->nkeys);
764 nklist->delkey = klist->nkeys; 764 nklist->delkey = klist->nkeys;
765 nklist->nkeys = klist->nkeys + 1; 765 nklist->nkeys = klist->nkeys + 1;
766 klist->delkey = USHORT_MAX;
766 klist->delkey = USHRT_MAX;
767 } else { 767 } else {
768 nklist->nkeys = 1; 768 nklist->nkeys = 1;
769 nklist->delkey = 0; 769 nklist->delkey = 0;
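
In keyring.c the 16-bit delkey field doubles as a sentinel: __key_link_begin() sets it to USHRT_MAX when the keyring is merely being extended, so the RCU disposal callback knows there is no displaced key to key_put(), and the quota check caps the key count at USHRT_MAX - 1 to keep the sentinel unambiguous. A simplified userspace model, with a hypothetical drop_ref() standing in for key_put():

/*
 * Simplified userspace model of the delkey sentinel, not kernel code.
 */
#include <limits.h>
#include <stdio.h>

struct keyring_list_model {
	unsigned short delkey;	/* slot to drop, or USHRT_MAX for none */
};

static void drop_ref(unsigned short idx)
{
	printf("key_put() on slot %u\n", idx);
}

static void disposal_callback(struct keyring_list_model *klist)
{
	if (klist->delkey != USHRT_MAX)
		drop_ref(klist->delkey);	/* a key was displaced */
	else
		printf("pure extension: nothing to put\n");
}

int main(void)
{
	struct keyring_list_model grow = { .delkey = USHRT_MAX };
	struct keyring_list_model replace = { .delkey = 3 };

	disposal_callback(&grow);
	disposal_callback(&replace);
	return 0;
}
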