-rw-r--r--  Documentation/vm/frontswap.txt         278
-rw-r--r--  MAINTAINERS                              7
-rw-r--r--  arch/avr32/kernel/signal.c               2
-rw-r--r--  arch/xtensa/include/asm/syscall.h        4
-rw-r--r--  arch/xtensa/kernel/signal.c              2
-rw-r--r--  drivers/staging/ramster/zcache-main.c    8
-rw-r--r--  drivers/staging/zcache/zcache-main.c    10
-rw-r--r--  drivers/xen/tmem.c                       8
-rw-r--r--  fs/cifs/cifsglob.h                       7
-rw-r--r--  fs/cifs/cifsproto.h                      1
-rw-r--r--  fs/cifs/cifssmb.c                        8
-rw-r--r--  fs/cifs/connect.c                        8
-rw-r--r--  fs/cifs/file.c                         106
-rw-r--r--  fs/cifs/misc.c                          89
-rw-r--r--  fs/cifs/smb1ops.c                       89
-rw-r--r--  fs/cifs/transport.c                      2
-rw-r--r--  include/linux/frontswap.h              127
-rw-r--r--  include/linux/swap.h                     4
-rw-r--r--  include/linux/swapfile.h                13
-rw-r--r--  mm/Kconfig                              17
-rw-r--r--  mm/Makefile                              1
-rw-r--r--  mm/frontswap.c                         314
-rw-r--r--  mm/page_io.c                            12
-rw-r--r--  mm/swapfile.c                           54
24 files changed, 998 insertions, 173 deletions
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt
new file mode 100644
index 00000000000..37067cf455f
--- /dev/null
+++ b/Documentation/vm/frontswap.txt
@@ -0,0 +1,278 @@
Frontswap provides a "transcendent memory" interface for swap pages.
In some environments, dramatic performance savings may be obtained because
swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.

(Note, frontswap -- and cleancache (merged at 3.0) -- are the "frontends"
and the only necessary changes to the core kernel for transcendent memory;
all other supporting code -- the "backends" -- is implemented as drivers.
See the LWN.net article "Transcendent memory in a nutshell" for a detailed
overview of frontswap and related kernel parts:
https://lwn.net/Articles/454795/ )

Frontswap is so named because it can be thought of as the opposite of
a "backing" store for a swap device. The storage is assumed to be
a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
to the requirements of transcendent memory (such as Xen's "tmem", or
in-kernel compressed memory, aka "zcache", or future RAM-like devices);
this pseudo-RAM device is not directly accessible or addressable by the
kernel and is of unknown and possibly time-varying size. The driver
links itself to frontswap by calling frontswap_register_ops to set the
frontswap_ops funcs appropriately and the functions it provides must
conform to certain policies as follows:

An "init" prepares the device to receive frontswap pages associated
with the specified swap device number (aka "type"). A "store" will
copy the page to transcendent memory and associate it with the type and
offset associated with the page. A "load" will copy the page, if found,
from transcendent memory into kernel memory, but will NOT remove the page
from transcendent memory. An "invalidate_page" will remove the page
from transcendent memory and an "invalidate_area" will remove ALL pages
associated with the swap type (e.g., like swapoff) and notify the "device"
to refuse further stores with that swap type.

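To make the calling convention concrete, here is a minimal hypothetical
backend sketch. The names are illustrative and not part of this patch;
only the signatures (from include/linux/frontswap.h, added below) are
real. It conforms to the policies above in the laziest possible way,
by rejecting every store so the kernel always falls back to the real
swap device:

    #include <linux/frontswap.h>

    static void example_init(unsigned type)
    {
            /* prepare to receive pages for swap device number "type" */
    }

    /* return 0 on success; returning -1 means "no room" and the kernel
     * writes the page to the real swap device instead */
    static int example_store(unsigned type, pgoff_t offset,
                             struct page *page)
    {
            return -1;
    }

    /* return 0 and fill the page if (type, offset) was stored earlier;
     * never reached here because example_store() rejects everything */
    static int example_load(unsigned type, pgoff_t offset,
                            struct page *page)
    {
            return -1;
    }

    static void example_invalidate_page(unsigned type, pgoff_t offset)
    {
            /* forget the single page at (type, offset) */
    }

    static void example_invalidate_area(unsigned type)
    {
            /* forget ALL pages for this swap type, e.g. at swapoff */
    }

    static struct frontswap_ops example_frontswap_ops = {
            .init = example_init,
            .store = example_store,
            .load = example_load,
            .invalidate_page = example_invalidate_page,
            .invalidate_area = example_invalidate_area,
    };

    static int __init example_register(void)
    {
            struct frontswap_ops old_ops =
                    frontswap_register_ops(&example_frontswap_ops);

            (void)old_ops;  /* a real driver would inspect this */
            return 0;
    }

Note that frontswap_register_ops() returns the previously registered
ops by value, so a driver can detect that some other backend (or a
nested one) registered first.
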
Once a page is successfully stored, a matching load on the page will normally
succeed. So when the kernel finds itself in a situation where it needs
to swap out a page, it first attempts to use frontswap. If the store returns
success, the data has been successfully saved to transcendent memory and
a disk write and, if the data is later read back, a disk read are avoided.
If a store returns failure, transcendent memory has rejected the data, and the
page can be written to swap as usual.

If a backend chooses, frontswap can be configured as a "writethrough
cache" by calling frontswap_writethrough(). In this mode, the reduction
in swap device writes is lost (and also a non-trivial performance advantage)
in order to allow the backend to arbitrarily "reclaim" space used to
store frontswap pages to more completely manage its memory usage.

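For example, a backend opting in would make one call during its
initialization (a hedged sketch; the function is real, the context
illustrative):

    /* hypothetical: one call from the backend's init code; afterwards
     * every successful store is also written to the swap device */
    frontswap_writethrough(true);
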
Note that if a page is stored and the page already exists in transcendent memory
(a "duplicate" store), either the store succeeds and the data is overwritten,
or the store fails AND the page is invalidated. This ensures stale data may
never be obtained from frontswap.

If properly configured, monitoring of frontswap is done via debugfs in
the /sys/kernel/debug/frontswap directory. The effectiveness of
frontswap can be measured (across all swap devices) with:

failed_stores - how many store attempts have failed
loads - how many loads were attempted (all should succeed)
succ_stores - how many store attempts have succeeded
invalidates - how many invalidates were attempted

A backend implementation may provide additional metrics.

FAQ

1) Where's the value?

When a workload starts swapping, performance falls through the floor.
Frontswap significantly increases performance in many such workloads by
providing a clean, dynamic interface to read and write swap pages to
"transcendent memory" that is otherwise not directly addressable by the kernel.
This interface is ideal when data is transformed to a different form
and size (such as with compression) or secretly moved (as might be
useful for write-balancing for some RAM-like devices). Swap pages (and
evicted page-cache pages) are a great use for this kind of slower-than-RAM-
but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
cleancache) interface to transcendent memory provides a nice way to read
and write -- and indirectly "name" -- the pages.

Frontswap -- and cleancache -- with a fairly small impact on the kernel,
provides a huge amount of flexibility for more dynamic, flexible RAM
utilization in various system configurations:

In the single kernel case, aka "zcache", pages are compressed and
stored in local memory, thus increasing the total anonymous pages
that can be safely kept in RAM. Zcache essentially trades off CPU
cycles used in compression/decompression for better memory utilization.
Benchmarks have shown little or no impact when memory pressure is
low while providing a significant performance improvement (25%+)
on some workloads under high memory pressure.

"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
support for clustered systems. Frontswap pages are locally compressed
as in zcache, but then "remotified" to another system's RAM. This
allows RAM to be dynamically load-balanced back-and-forth as needed,
i.e. when system A is overcommitted, it can swap to system B, and
vice versa. RAMster can also be configured as a memory server so
many servers in a cluster can swap, dynamically as needed, to a single
server configured with a large amount of RAM... without pre-configuring
how much of the RAM is available for each of the clients!

In the virtual case, the whole point of virtualization is to statistically
multiplex physical resources across the varying demands of multiple
virtual machines. This is really hard to do with RAM and efforts to do
it well with no kernel changes have essentially failed (except in some
well-publicized special-case workloads).
Specifically, the Xen Transcendent Memory backend allows otherwise
"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
virtual machines, but the pages can be compressed and deduplicated to
optimize RAM utilization. And when guest OS's are induced to surrender
underutilized RAM (e.g. with "selfballooning"), sudden unexpected
memory pressure may result in swapping; frontswap allows those pages
to be swapped to and from hypervisor RAM (if overall host system memory
conditions allow), thus mitigating the potentially awful performance impact
of unplanned swapping.

A KVM implementation is underway and has been RFC'ed to lkml. And,
using frontswap, investigation is also underway on the use of NVM as
a memory extension technology.

2) Sure there may be performance advantages in some situations, but
   what's the space/time overhead of frontswap?

If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
nothingness and the only overhead is a few extra bytes per swapon'ed
swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
registers, there is one extra check of a global variable against zero for
every swap page read or written. If CONFIG_FRONTSWAP is enabled
AND a frontswap backend registers AND the backend fails every "store"
request (i.e. provides no memory despite claiming it might),
CPU overhead is still negligible -- and since every frontswap fail
precedes a swap page write-to-disk, the system is highly likely
to be I/O bound and using a small fraction of a percent of a CPU
will be irrelevant anyway.

As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
registers, one bit is allocated for every swap page for every swap
device that is swapon'd. This is added to the EIGHT bits (which
was sixteen until about 2.6.34) that the kernel already allocates
for every swap page for every swap device that is swapon'd. (Hugh
Dickins has observed that frontswap could probably steal one of
the existing eight bits, but let's worry about that minor optimization
later.) For very large swap disks (which are rare) on a standard
4K pagesize, this is 1MB per 32GB swap.

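(To spell out the arithmetic: 32GB of swap at a 4K pagesize is
32GB / 4KB = 8M pages; one bit per page is 8M bits = 1MB of frontswap_map.)
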
When swap pages are stored in transcendent memory instead of written
out to disk, there is a side effect that this may create more memory
pressure that can potentially outweigh the other advantages. A
backend, such as zcache, must implement policies to carefully (but
dynamically) manage memory limits to ensure this doesn't happen.

3) OK, how about a quick overview of what this frontswap patch does
   in terms that a kernel hacker can grok?

Let's assume that a frontswap "backend" has registered during
kernel initialization; this registration indicates that this
frontswap backend has access to some "memory" that is not directly
accessible by the kernel. Exactly how much memory it provides is
entirely dynamic and random.

Whenever a swap-device is swapon'd frontswap_init() is called,
passing the swap device number (aka "type") as a parameter.
This notifies frontswap to expect attempts to "store" swap pages
associated with that number.

Whenever the swap subsystem is readying a page to write to a swap
device (cf. swap_writepage()), frontswap_store is called. Frontswap
consults with the frontswap backend and if the backend says it does NOT
have room, frontswap_store returns -1 and the kernel swaps the page
to the swap device as normal. Note that the response from the frontswap
backend is unpredictable to the kernel; it may choose to never accept a
page, it could accept every ninth page, or it might accept every
page. But if the backend does accept a page, the data from the page
has already been copied and associated with the type and offset,
and the backend guarantees the persistence of the data. In this case,
frontswap sets a bit in the "frontswap_map" for the swap device
corresponding to the page offset on the swap device to which it would
otherwise have written the data.

When the swap subsystem needs to swap-in a page (swap_readpage()),
it first calls frontswap_load() which checks the frontswap_map to
see if the page was earlier accepted by the frontswap backend. If
it was, the page of data is filled from the frontswap backend and
the swap-in is complete. If not, the normal swap-in code is
executed to obtain the page of data from the real swap device.

So every time the frontswap backend accepts a page, a swap device write
and (potentially) a swap device read are replaced by a "frontswap backend
store" and (possibly) a "frontswap backend load", which are presumably much
faster.

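Abridged, and hedged as a sketch rather than a verbatim quote of the
mm/page_io.c hunk (which is not reproduced in this excerpt), the two
hooks look roughly like this:

    /* in swap_writepage(): try frontswap before submitting a bio;
     * on success no disk write ever happens */
    if (frontswap_store(page) == 0) {
            set_page_writeback(page);
            unlock_page(page);
            end_page_writeback(page);
            goto out;
    }
    /* ...otherwise fall through to the normal swap-device write... */

    /* in swap_readpage(): check frontswap before issuing the read;
     * on success no disk read ever happens */
    if (frontswap_load(page) == 0) {
            SetPageUptodate(page);
            unlock_page(page);
            goto out;
    }
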
4) Can't frontswap be configured as a "special" swap device that is
   just higher priority than any real swap device (e.g. like zswap,
   or maybe swap-over-nbd/NFS)?

No. First, the existing swap subsystem doesn't allow for any kind of
swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy,
but this would require fairly drastic changes. Even if it were
rewritten, the existing swap subsystem uses the block I/O layer which
assumes a swap device is fixed size and any page in it is linearly
addressable. Frontswap barely touches the existing swap subsystem,
and works around the constraints of the block I/O subsystem to provide
a great deal of flexibility and dynamicity.

For example, the acceptance of any swap page by the frontswap backend is
entirely unpredictable. This is critical to the definition of frontswap
backends because it grants completely dynamic discretion to the
backend. In zcache, one cannot know a priori how compressible a page is.
"Poorly" compressible pages can be rejected, and "poorly" can itself be
defined dynamically depending on current memory constraints.

Further, frontswap is entirely synchronous whereas a real swap
device is, by definition, asynchronous and uses block I/O. The
block I/O layer is not only unnecessary, but may perform "optimizations"
that are inappropriate for a RAM-oriented device including delaying
the write of some pages for a significant amount of time. Synchrony is
required to ensure the dynamicity of the backend and to avoid thorny race
conditions that would unnecessarily and greatly complicate frontswap
and/or the block I/O subsystem. That said, only the initial "store"
and "load" operations need be synchronous. A separate asynchronous thread
is free to manipulate the pages stored by frontswap. For example,
the "remotification" thread in RAMster uses standard asynchronous
kernel sockets to move compressed frontswap pages to a remote machine.
Similarly, a KVM guest-side implementation could do in-guest compression
and use "batched" hypercalls.

In a virtualized environment, the dynamicity allows the hypervisor
(or host OS) to do "intelligent overcommit". For example, it can
choose to accept pages only until host-swapping might be imminent,
then force guests to do their own swapping.

There is a downside to the transcendent memory specifications for
frontswap: Since any "store" might fail, there must always be a real
slot on a real swap device to swap the page. Thus frontswap must be
implemented as a "shadow" to every swapon'd device with the potential
capability of holding every page that the swap device might have held
and the possibility that it might hold no pages at all. This means
that frontswap cannot contain more pages than the total of swapon'd
swap devices. For example, if NO swap device is configured on some
installation, frontswap is useless. Swapless portable devices
can still use frontswap but a backend for such devices must configure
some kind of "ghost" swap device and ensure that it is never used.

5) Why this weird definition about "duplicate stores"? If a page
   has been previously successfully stored, can't it always be
   successfully overwritten?

Nearly always it can, but no, sometimes it cannot. Consider an example
where data is compressed and the original 4K page has been compressed
to 1K. Now an attempt is made to overwrite the page with data that
is non-compressible and so would take the entire 4K. But the backend
has no more space. In this case, the store must be rejected. Whenever
frontswap rejects a store that would overwrite, it also must invalidate
the old data and ensure that it is no longer accessible. Since the
swap subsystem then writes the new data to the real swap device,
this is the correct course of action to ensure coherency.

6) What is frontswap_shrink for?

When the (non-frontswap) swap subsystem swaps out a page to a real
swap device, that page is only taking up low-value pre-allocated disk
space. But if frontswap has placed a page in transcendent memory, that
page may be taking up valuable real estate. The frontswap_shrink
routine allows code outside of the swap subsystem to force pages out
of the memory managed by frontswap and back into kernel-addressable memory.
For example, in RAMster, a "suction driver" thread will attempt
to "repatriate" pages sent to a remote machine back to the local machine;
this is driven using the frontswap_shrink mechanism when memory pressure
subsides.

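A hedged sketch of how such a driver might use the API --
frontswap_curr_pages() and frontswap_shrink() are both exported by this
patch, but the halving policy here is purely illustrative:

    /* hypothetical "suction driver" policy: once local memory pressure
     * subsides, pull half of the frontswap-resident pages back into
     * kernel-addressable memory */
    unsigned long target = frontswap_curr_pages() / 2;

    frontswap_shrink(target);  /* frontswap now holds at most "target" pages */
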
7) Why does the frontswap patch create the new include file swapfile.h?

The frontswap code depends on some swap-subsystem-internal data
structures that have, over the years, moved back and forth between
static and global. This seemed a reasonable compromise: Define
them as global but declare them in a new include file that isn't
included by the large number of source files that include swap.h.

Dan Magenheimer, last updated April 9, 2012
diff --git a/MAINTAINERS b/MAINTAINERS
index 55f0fda602e..6a52bb4a4fc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2930,6 +2930,13 @@ F: Documentation/power/freezing-of-tasks.txt
2930F: include/linux/freezer.h 2930F: include/linux/freezer.h
2931F: kernel/freezer.c 2931F: kernel/freezer.c
2932 2932
2933FRONTSWAP API
2934M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2935L: linux-kernel@vger.kernel.org
2936S: Maintained
2937F: mm/frontswap.c
2938F: include/linux/frontswap.h
2939
2933FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS 2940FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS
2934M: David Howells <dhowells@redhat.com> 2941M: David Howells <dhowells@redhat.com>
2935L: linux-cachefs@redhat.com 2942L: linux-cachefs@redhat.com
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index c140f9b41dc..d552a854dac 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -300,7 +300,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
300 if ((sysreg_read(SR) & MODE_MASK) == MODE_SUPERVISOR) 300 if ((sysreg_read(SR) & MODE_MASK) == MODE_SUPERVISOR)
301 syscall = 1; 301 syscall = 1;
302 302
303 if (ti->flags & _TIF_SIGPENDING)) 303 if (ti->flags & _TIF_SIGPENDING)
304 do_signal(regs, syscall); 304 do_signal(regs, syscall);
305 305
306 if (ti->flags & _TIF_NOTIFY_RESUME) { 306 if (ti->flags & _TIF_NOTIFY_RESUME) {
diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h
index 0b9f2e13c78..c1dacca312f 100644
--- a/arch/xtensa/include/asm/syscall.h
+++ b/arch/xtensa/include/asm/syscall.h
@@ -31,5 +31,5 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
31asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, 31asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
32 struct timespec __user *tsp, const sigset_t __user *sigmask, 32 struct timespec __user *tsp, const sigset_t __user *sigmask,
33 size_t sigsetsize); 33 size_t sigsetsize);
34 34asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset,
35 35 size_t sigsetsize);
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index b9f8e5850d3..efe4e854b3c 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -493,7 +493,7 @@ static void do_signal(struct pt_regs *regs)
493 if (ret) 493 if (ret)
494 return; 494 return;
495 495
496 signal_delivered(signr, info, ka, regs, 0); 496 signal_delivered(signr, &info, &ka, regs, 0);
497 if (current->ptrace & PT_SINGLESTEP) 497 if (current->ptrace & PT_SINGLESTEP)
498 task_pt_regs(current)->icountlevel = 1; 498 task_pt_regs(current)->icountlevel = 1;
499 499
diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c
index 4e7ef0e6b79..d46764b5aab 100644
--- a/drivers/staging/ramster/zcache-main.c
+++ b/drivers/staging/ramster/zcache-main.c
@@ -3002,7 +3002,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
3002 return oid; 3002 return oid;
3003} 3003}
3004 3004
3005static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, 3005static int zcache_frontswap_store(unsigned type, pgoff_t offset,
3006 struct page *page) 3006 struct page *page)
3007{ 3007{
3008 u64 ind64 = (u64)offset; 3008 u64 ind64 = (u64)offset;
@@ -3025,7 +3025,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
3025 3025
3026/* returns 0 if the page was successfully gotten from frontswap, -1 if 3026/* returns 0 if the page was successfully gotten from frontswap, -1 if
3027 * was not present (should never happen!) */ 3027 * was not present (should never happen!) */
3028static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, 3028static int zcache_frontswap_load(unsigned type, pgoff_t offset,
3029 struct page *page) 3029 struct page *page)
3030{ 3030{
3031 u64 ind64 = (u64)offset; 3031 u64 ind64 = (u64)offset;
@@ -3080,8 +3080,8 @@ static void zcache_frontswap_init(unsigned ignored)
3080} 3080}
3081 3081
3082static struct frontswap_ops zcache_frontswap_ops = { 3082static struct frontswap_ops zcache_frontswap_ops = {
3083 .put_page = zcache_frontswap_put_page, 3083 .store = zcache_frontswap_store,
3084 .get_page = zcache_frontswap_get_page, 3084 .load = zcache_frontswap_load,
3085 .invalidate_page = zcache_frontswap_flush_page, 3085 .invalidate_page = zcache_frontswap_flush_page,
3086 .invalidate_area = zcache_frontswap_flush_area, 3086 .invalidate_area = zcache_frontswap_flush_area,
3087 .init = zcache_frontswap_init 3087 .init = zcache_frontswap_init
diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
index 2734dacacba..784c796b984 100644
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -1835,7 +1835,7 @@ static int zcache_frontswap_poolid = -1;
1835 * Swizzling increases objects per swaptype, increasing tmem concurrency 1835 * Swizzling increases objects per swaptype, increasing tmem concurrency
1836 * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS 1836 * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
1837 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from 1837 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
1838 * frontswap_get_page(), but has side-effects. Hence using 8. 1838 * frontswap_load(), but has side-effects. Hence using 8.
1839 */ 1839 */
1840#define SWIZ_BITS 8 1840#define SWIZ_BITS 8
1841#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) 1841#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
@@ -1849,7 +1849,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1849 return oid; 1849 return oid;
1850} 1850}
1851 1851
1852static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, 1852static int zcache_frontswap_store(unsigned type, pgoff_t offset,
1853 struct page *page) 1853 struct page *page)
1854{ 1854{
1855 u64 ind64 = (u64)offset; 1855 u64 ind64 = (u64)offset;
@@ -1870,7 +1870,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1870 1870
1871/* returns 0 if the page was successfully gotten from frontswap, -1 if 1871/* returns 0 if the page was successfully gotten from frontswap, -1 if
1872 * was not present (should never happen!) */ 1872 * was not present (should never happen!) */
1873static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, 1873static int zcache_frontswap_load(unsigned type, pgoff_t offset,
1874 struct page *page) 1874 struct page *page)
1875{ 1875{
1876 u64 ind64 = (u64)offset; 1876 u64 ind64 = (u64)offset;
@@ -1919,8 +1919,8 @@ static void zcache_frontswap_init(unsigned ignored)
1919} 1919}
1920 1920
1921static struct frontswap_ops zcache_frontswap_ops = { 1921static struct frontswap_ops zcache_frontswap_ops = {
1922 .put_page = zcache_frontswap_put_page, 1922 .store = zcache_frontswap_store,
1923 .get_page = zcache_frontswap_get_page, 1923 .load = zcache_frontswap_load,
1924 .invalidate_page = zcache_frontswap_flush_page, 1924 .invalidate_page = zcache_frontswap_flush_page,
1925 .invalidate_area = zcache_frontswap_flush_area, 1925 .invalidate_area = zcache_frontswap_flush_area,
1926 .init = zcache_frontswap_init 1926 .init = zcache_frontswap_init
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index dcb79521e6c..89f264c6742 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -269,7 +269,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
269} 269}
270 270
271/* returns 0 if the page was successfully put into frontswap, -1 if not */ 271/* returns 0 if the page was successfully put into frontswap, -1 if not */
272static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, 272static int tmem_frontswap_store(unsigned type, pgoff_t offset,
273 struct page *page) 273 struct page *page)
274{ 274{
275 u64 ind64 = (u64)offset; 275 u64 ind64 = (u64)offset;
@@ -295,7 +295,7 @@ static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
295 * returns 0 if the page was successfully gotten from frontswap, -1 if 295 * returns 0 if the page was successfully gotten from frontswap, -1 if
296 * was not present (should never happen!) 296 * was not present (should never happen!)
297 */ 297 */
298static int tmem_frontswap_get_page(unsigned type, pgoff_t offset, 298static int tmem_frontswap_load(unsigned type, pgoff_t offset,
299 struct page *page) 299 struct page *page)
300{ 300{
301 u64 ind64 = (u64)offset; 301 u64 ind64 = (u64)offset;
@@ -362,8 +362,8 @@ static int __init no_frontswap(char *s)
362__setup("nofrontswap", no_frontswap); 362__setup("nofrontswap", no_frontswap);
363 363
364static struct frontswap_ops __initdata tmem_frontswap_ops = { 364static struct frontswap_ops __initdata tmem_frontswap_ops = {
365 .put_page = tmem_frontswap_put_page, 365 .store = tmem_frontswap_store,
366 .get_page = tmem_frontswap_get_page, 366 .load = tmem_frontswap_load,
367 .invalidate_page = tmem_frontswap_flush_page, 367 .invalidate_page = tmem_frontswap_flush_page,
368 .invalidate_area = tmem_frontswap_flush_area, 368 .invalidate_area = tmem_frontswap_flush_area,
369 .init = tmem_frontswap_init 369 .init = tmem_frontswap_init
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 20350a93ed9..6df0cbe1cbc 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -174,6 +174,7 @@ struct smb_version_operations {
174 void (*add_credits)(struct TCP_Server_Info *, const unsigned int); 174 void (*add_credits)(struct TCP_Server_Info *, const unsigned int);
175 void (*set_credits)(struct TCP_Server_Info *, const int); 175 void (*set_credits)(struct TCP_Server_Info *, const int);
176 int * (*get_credits_field)(struct TCP_Server_Info *); 176 int * (*get_credits_field)(struct TCP_Server_Info *);
177 __u64 (*get_next_mid)(struct TCP_Server_Info *);
177 /* data offset from read response message */ 178 /* data offset from read response message */
178 unsigned int (*read_data_offset)(char *); 179 unsigned int (*read_data_offset)(char *);
179 /* data length from read response message */ 180 /* data length from read response message */
@@ -399,6 +400,12 @@ set_credits(struct TCP_Server_Info *server, const int val)
399 server->ops->set_credits(server, val); 400 server->ops->set_credits(server, val);
400} 401}
401 402
403static inline __u64
404get_next_mid(struct TCP_Server_Info *server)
405{
406 return server->ops->get_next_mid(server);
407}
408
402/* 409/*
403 * Macros to allow the TCP_Server_Info->net field and related code to drop out 410 * Macros to allow the TCP_Server_Info->net field and related code to drop out
404 * when CONFIG_NET_NS isn't set. 411 * when CONFIG_NET_NS isn't set.
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5ec21ecf798..0a6cbfe2761 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -114,7 +114,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
114 void **request_buf); 114 void **request_buf);
115extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, 115extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
116 const struct nls_table *nls_cp); 116 const struct nls_table *nls_cp);
117extern __u64 GetNextMid(struct TCP_Server_Info *server);
118extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 117extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
119extern u64 cifs_UnixTimeToNT(struct timespec); 118extern u64 cifs_UnixTimeToNT(struct timespec);
120extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 119extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b5ad716b264..5b400730c21 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -268,7 +268,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
268 return rc; 268 return rc;
269 269
270 buffer = (struct smb_hdr *)*request_buf; 270 buffer = (struct smb_hdr *)*request_buf;
271 buffer->Mid = GetNextMid(ses->server); 271 buffer->Mid = get_next_mid(ses->server);
272 if (ses->capabilities & CAP_UNICODE) 272 if (ses->capabilities & CAP_UNICODE)
273 buffer->Flags2 |= SMBFLG2_UNICODE; 273 buffer->Flags2 |= SMBFLG2_UNICODE;
274 if (ses->capabilities & CAP_STATUS32) 274 if (ses->capabilities & CAP_STATUS32)
@@ -402,7 +402,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
402 402
403 cFYI(1, "secFlags 0x%x", secFlags); 403 cFYI(1, "secFlags 0x%x", secFlags);
404 404
405 pSMB->hdr.Mid = GetNextMid(server); 405 pSMB->hdr.Mid = get_next_mid(server);
406 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); 406 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
407 407
408 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) 408 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
@@ -782,7 +782,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
782 return rc; 782 return rc;
783 } 783 }
784 784
785 pSMB->hdr.Mid = GetNextMid(ses->server); 785 pSMB->hdr.Mid = get_next_mid(ses->server);
786 786
787 if (ses->server->sec_mode & 787 if (ses->server->sec_mode &
788 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 788 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
@@ -4762,7 +4762,7 @@ getDFSRetry:
4762 4762
4763 /* server pointer checked in called function, 4763 /* server pointer checked in called function,
4764 but should never be null here anyway */ 4764 but should never be null here anyway */
4765 pSMB->hdr.Mid = GetNextMid(ses->server); 4765 pSMB->hdr.Mid = get_next_mid(ses->server);
4766 pSMB->hdr.Tid = ses->ipc_tid; 4766 pSMB->hdr.Tid = ses->ipc_tid;
4767 pSMB->hdr.Uid = ses->Suid; 4767 pSMB->hdr.Uid = ses->Suid;
4768 if (ses->capabilities & CAP_STATUS32) 4768 if (ses->capabilities & CAP_STATUS32)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ccafdedd0db..78db68a5cf4 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1058,13 +1058,15 @@ cifs_demultiplex_thread(void *p)
1058 if (mid_entry != NULL) { 1058 if (mid_entry != NULL) {
1059 if (!mid_entry->multiRsp || mid_entry->multiEnd) 1059 if (!mid_entry->multiRsp || mid_entry->multiEnd)
1060 mid_entry->callback(mid_entry); 1060 mid_entry->callback(mid_entry);
1061 } else if (!server->ops->is_oplock_break(buf, server)) { 1061 } else if (!server->ops->is_oplock_break ||
1062 !server->ops->is_oplock_break(buf, server)) {
1062 cERROR(1, "No task to wake, unknown frame received! " 1063 cERROR(1, "No task to wake, unknown frame received! "
1063 "NumMids %d", atomic_read(&midCount)); 1064 "NumMids %d", atomic_read(&midCount));
1064 cifs_dump_mem("Received Data is: ", buf, 1065 cifs_dump_mem("Received Data is: ", buf,
1065 HEADER_SIZE(server)); 1066 HEADER_SIZE(server));
1066#ifdef CONFIG_CIFS_DEBUG2 1067#ifdef CONFIG_CIFS_DEBUG2
1067 server->ops->dump_detail(buf); 1068 if (server->ops->dump_detail)
1069 server->ops->dump_detail(buf);
1068 cifs_dump_mids(server); 1070 cifs_dump_mids(server);
1069#endif /* CIFS_DEBUG2 */ 1071#endif /* CIFS_DEBUG2 */
1070 1072
@@ -3938,7 +3940,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
3938 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, 3940 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
3939 NULL /*no tid */ , 4 /*wct */ ); 3941 NULL /*no tid */ , 4 /*wct */ );
3940 3942
3941 smb_buffer->Mid = GetNextMid(ses->server); 3943 smb_buffer->Mid = get_next_mid(ses->server);
3942 smb_buffer->Uid = ses->Suid; 3944 smb_buffer->Uid = ses->Suid;
3943 pSMB = (TCONX_REQ *) smb_buffer; 3945 pSMB = (TCONX_REQ *) smb_buffer;
3944 pSMBr = (TCONX_RSP *) smb_buffer_response; 3946 pSMBr = (TCONX_RSP *) smb_buffer_response;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 253170dfa71..513adbc211d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -876,7 +876,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
876 struct cifsLockInfo *li, *tmp; 876 struct cifsLockInfo *li, *tmp;
877 struct cifs_tcon *tcon; 877 struct cifs_tcon *tcon;
878 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); 878 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
879 unsigned int num, max_num; 879 unsigned int num, max_num, max_buf;
880 LOCKING_ANDX_RANGE *buf, *cur; 880 LOCKING_ANDX_RANGE *buf, *cur;
881 int types[] = {LOCKING_ANDX_LARGE_FILES, 881 int types[] = {LOCKING_ANDX_LARGE_FILES,
882 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; 882 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
@@ -892,8 +892,19 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
892 return rc; 892 return rc;
893 } 893 }
894 894
895 max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / 895 /*
896 sizeof(LOCKING_ANDX_RANGE); 896 * Accessing maxBuf is racy with cifs_reconnect - need to store value
897 * and check it for zero before using.
898 */
899 max_buf = tcon->ses->server->maxBuf;
900 if (!max_buf) {
901 mutex_unlock(&cinode->lock_mutex);
902 FreeXid(xid);
903 return -EINVAL;
904 }
905
906 max_num = (max_buf - sizeof(struct smb_hdr)) /
907 sizeof(LOCKING_ANDX_RANGE);
897 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 908 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
898 if (!buf) { 909 if (!buf) {
899 mutex_unlock(&cinode->lock_mutex); 910 mutex_unlock(&cinode->lock_mutex);
@@ -1218,7 +1229,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
1218 int types[] = {LOCKING_ANDX_LARGE_FILES, 1229 int types[] = {LOCKING_ANDX_LARGE_FILES,
1219 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; 1230 LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
1220 unsigned int i; 1231 unsigned int i;
1221 unsigned int max_num, num; 1232 unsigned int max_num, num, max_buf;
1222 LOCKING_ANDX_RANGE *buf, *cur; 1233 LOCKING_ANDX_RANGE *buf, *cur;
1223 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1234 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1224 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); 1235 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
@@ -1228,8 +1239,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
1228 1239
1229 INIT_LIST_HEAD(&tmp_llist); 1240 INIT_LIST_HEAD(&tmp_llist);
1230 1241
1231 max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / 1242 /*
1232 sizeof(LOCKING_ANDX_RANGE); 1243 * Accessing maxBuf is racy with cifs_reconnect - need to store value
1244 * and check it for zero before using.
1245 */
1246 max_buf = tcon->ses->server->maxBuf;
1247 if (!max_buf)
1248 return -EINVAL;
1249
1250 max_num = (max_buf - sizeof(struct smb_hdr)) /
1251 sizeof(LOCKING_ANDX_RANGE);
1233 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 1252 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
1234 if (!buf) 1253 if (!buf)
1235 return -ENOMEM; 1254 return -ENOMEM;
@@ -1247,46 +1266,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
1247 continue; 1266 continue;
1248 if (types[i] != li->type) 1267 if (types[i] != li->type)
1249 continue; 1268 continue;
1250 if (!cinode->can_cache_brlcks) { 1269 if (cinode->can_cache_brlcks) {
1251 cur->Pid = cpu_to_le16(li->pid);
1252 cur->LengthLow = cpu_to_le32((u32)li->length);
1253 cur->LengthHigh =
1254 cpu_to_le32((u32)(li->length>>32));
1255 cur->OffsetLow = cpu_to_le32((u32)li->offset);
1256 cur->OffsetHigh =
1257 cpu_to_le32((u32)(li->offset>>32));
1258 /*
1259 * We need to save a lock here to let us add
1260 * it again to the file's list if the unlock
1261 * range request fails on the server.
1262 */
1263 list_move(&li->llist, &tmp_llist);
1264 if (++num == max_num) {
1265 stored_rc = cifs_lockv(xid, tcon,
1266 cfile->netfid,
1267 li->type, num,
1268 0, buf);
1269 if (stored_rc) {
1270 /*
1271 * We failed on the unlock range
1272 * request - add all locks from
1273 * the tmp list to the head of
1274 * the file's list.
1275 */
1276 cifs_move_llist(&tmp_llist,
1277 &cfile->llist);
1278 rc = stored_rc;
1279 } else
1280 /*
1281 * The unlock range request
1282 * succeed - free the tmp list.
1283 */
1284 cifs_free_llist(&tmp_llist);
1285 cur = buf;
1286 num = 0;
1287 } else
1288 cur++;
1289 } else {
1290 /* 1270 /*
1291 * We can cache brlock requests - simply remove 1271 * We can cache brlock requests - simply remove
1292 * a lock from the file's list. 1272 * a lock from the file's list.
@@ -1294,7 +1274,41 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
1294 list_del(&li->llist); 1274 list_del(&li->llist);
1295 cifs_del_lock_waiters(li); 1275 cifs_del_lock_waiters(li);
1296 kfree(li); 1276 kfree(li);
1277 continue;
1297 } 1278 }
1279 cur->Pid = cpu_to_le16(li->pid);
1280 cur->LengthLow = cpu_to_le32((u32)li->length);
1281 cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
1282 cur->OffsetLow = cpu_to_le32((u32)li->offset);
1283 cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
1284 /*
1285 * We need to save a lock here to let us add it again to
1286 * the file's list if the unlock range request fails on
1287 * the server.
1288 */
1289 list_move(&li->llist, &tmp_llist);
1290 if (++num == max_num) {
1291 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
1292 li->type, num, 0, buf);
1293 if (stored_rc) {
1294 /*
1295 * We failed on the unlock range
1296 * request - add all locks from the tmp
1297 * list to the head of the file's list.
1298 */
1299 cifs_move_llist(&tmp_llist,
1300 &cfile->llist);
1301 rc = stored_rc;
1302 } else
1303 /*
1304 * The unlock range request succeed -
1305 * free the tmp list.
1306 */
1307 cifs_free_llist(&tmp_llist);
1308 cur = buf;
1309 num = 0;
1310 } else
1311 cur++;
1298 } 1312 }
1299 if (num) { 1313 if (num) {
1300 stored_rc = cifs_lockv(xid, tcon, cfile->netfid, 1314 stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e2552d2b2e4..557506ae1e2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -212,93 +212,6 @@ cifs_small_buf_release(void *buf_to_free)
212 return; 212 return;
213} 213}
214 214
215/*
216 * Find a free multiplex id (SMB mid). Otherwise there could be
217 * mid collisions which might cause problems, demultiplexing the
218 * wrong response to this request. Multiplex ids could collide if
219 * one of a series requests takes much longer than the others, or
220 * if a very large number of long lived requests (byte range
221 * locks or FindNotify requests) are pending. No more than
222 * 64K-1 requests can be outstanding at one time. If no
223 * mids are available, return zero. A future optimization
224 * could make the combination of mids and uid the key we use
225 * to demultiplex on (rather than mid alone).
226 * In addition to the above check, the cifs demultiplex
227 * code already used the command code as a secondary
228 * check of the frame and if signing is negotiated the
229 * response would be discarded if the mid were the same
230 * but the signature was wrong. Since the mid is not put in the
231 * pending queue until later (when it is about to be dispatched)
232 * we do have to limit the number of outstanding requests
233 * to somewhat less than 64K-1 although it is hard to imagine
234 * so many threads being in the vfs at one time.
235 */
236__u64 GetNextMid(struct TCP_Server_Info *server)
237{
238 __u64 mid = 0;
239 __u16 last_mid, cur_mid;
240 bool collision;
241
242 spin_lock(&GlobalMid_Lock);
243
244 /* mid is 16 bit only for CIFS/SMB */
245 cur_mid = (__u16)((server->CurrentMid) & 0xffff);
246 /* we do not want to loop forever */
247 last_mid = cur_mid;
248 cur_mid++;
249
250 /*
251 * This nested loop looks more expensive than it is.
252 * In practice the list of pending requests is short,
253 * fewer than 50, and the mids are likely to be unique
254 * on the first pass through the loop unless some request
255 * takes longer than the 64 thousand requests before it
256 * (and it would also have to have been a request that
257 * did not time out).
258 */
259 while (cur_mid != last_mid) {
260 struct mid_q_entry *mid_entry;
261 unsigned int num_mids;
262
263 collision = false;
264 if (cur_mid == 0)
265 cur_mid++;
266
267 num_mids = 0;
268 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
269 ++num_mids;
270 if (mid_entry->mid == cur_mid &&
271 mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
272 /* This mid is in use, try a different one */
273 collision = true;
274 break;
275 }
276 }
277
278 /*
279 * if we have more than 32k mids in the list, then something
280 * is very wrong. Possibly a local user is trying to DoS the
281 * box by issuing long-running calls and SIGKILL'ing them. If
282 * we get to 2^16 mids then we're in big trouble as this
283 * function could loop forever.
284 *
285 * Go ahead and assign out the mid in this situation, but force
286 * an eventual reconnect to clean out the pending_mid_q.
287 */
288 if (num_mids > 32768)
289 server->tcpStatus = CifsNeedReconnect;
290
291 if (!collision) {
292 mid = (__u64)cur_mid;
293 server->CurrentMid = mid;
294 break;
295 }
296 cur_mid++;
297 }
298 spin_unlock(&GlobalMid_Lock);
299 return mid;
300}
301
302/* NB: MID can not be set if treeCon not passed in, in that 215/* NB: MID can not be set if treeCon not passed in, in that
303 case it is responsbility of caller to set the mid */ 216 case it is responsbility of caller to set the mid */
304void 217void
@@ -334,7 +247,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
334 247
335 /* Uid is not converted */ 248 /* Uid is not converted */
336 buffer->Uid = treeCon->ses->Suid; 249 buffer->Uid = treeCon->ses->Suid;
337 buffer->Mid = GetNextMid(treeCon->ses->server); 250 buffer->Mid = get_next_mid(treeCon->ses->server);
338 } 251 }
339 if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) 252 if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
340 buffer->Flags2 |= SMBFLG2_DFS; 253 buffer->Flags2 |= SMBFLG2_DFS;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d9d615fbed3..6dec38f5522 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -125,6 +125,94 @@ cifs_get_credits_field(struct TCP_Server_Info *server)
125 return &server->credits; 125 return &server->credits;
126} 126}
127 127
128/*
129 * Find a free multiplex id (SMB mid). Otherwise there could be
130 * mid collisions which might cause problems, demultiplexing the
131 * wrong response to this request. Multiplex ids could collide if
132 * one of a series requests takes much longer than the others, or
133 * if a very large number of long lived requests (byte range
134 * locks or FindNotify requests) are pending. No more than
135 * 64K-1 requests can be outstanding at one time. If no
136 * mids are available, return zero. A future optimization
137 * could make the combination of mids and uid the key we use
138 * to demultiplex on (rather than mid alone).
139 * In addition to the above check, the cifs demultiplex
140 * code already used the command code as a secondary
141 * check of the frame and if signing is negotiated the
142 * response would be discarded if the mid were the same
143 * but the signature was wrong. Since the mid is not put in the
144 * pending queue until later (when it is about to be dispatched)
145 * we do have to limit the number of outstanding requests
146 * to somewhat less than 64K-1 although it is hard to imagine
147 * so many threads being in the vfs at one time.
148 */
149static __u64
150cifs_get_next_mid(struct TCP_Server_Info *server)
151{
152 __u64 mid = 0;
153 __u16 last_mid, cur_mid;
154 bool collision;
155
156 spin_lock(&GlobalMid_Lock);
157
158 /* mid is 16 bit only for CIFS/SMB */
159 cur_mid = (__u16)((server->CurrentMid) & 0xffff);
160 /* we do not want to loop forever */
161 last_mid = cur_mid;
162 cur_mid++;
163
164 /*
165 * This nested loop looks more expensive than it is.
166 * In practice the list of pending requests is short,
167 * fewer than 50, and the mids are likely to be unique
168 * on the first pass through the loop unless some request
169 * takes longer than the 64 thousand requests before it
170 * (and it would also have to have been a request that
171 * did not time out).
172 */
173 while (cur_mid != last_mid) {
174 struct mid_q_entry *mid_entry;
175 unsigned int num_mids;
176
177 collision = false;
178 if (cur_mid == 0)
179 cur_mid++;
180
181 num_mids = 0;
182 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
183 ++num_mids;
184 if (mid_entry->mid == cur_mid &&
185 mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
186 /* This mid is in use, try a different one */
187 collision = true;
188 break;
189 }
190 }
191
192 /*
193 * if we have more than 32k mids in the list, then something
194 * is very wrong. Possibly a local user is trying to DoS the
195 * box by issuing long-running calls and SIGKILL'ing them. If
196 * we get to 2^16 mids then we're in big trouble as this
197 * function could loop forever.
198 *
199 * Go ahead and assign out the mid in this situation, but force
200 * an eventual reconnect to clean out the pending_mid_q.
201 */
202 if (num_mids > 32768)
203 server->tcpStatus = CifsNeedReconnect;
204
205 if (!collision) {
206 mid = (__u64)cur_mid;
207 server->CurrentMid = mid;
208 break;
209 }
210 cur_mid++;
211 }
212 spin_unlock(&GlobalMid_Lock);
213 return mid;
214}
215
128struct smb_version_operations smb1_operations = { 216struct smb_version_operations smb1_operations = {
129 .send_cancel = send_nt_cancel, 217 .send_cancel = send_nt_cancel,
130 .compare_fids = cifs_compare_fids, 218 .compare_fids = cifs_compare_fids,
@@ -133,6 +221,7 @@ struct smb_version_operations smb1_operations = {
133 .add_credits = cifs_add_credits, 221 .add_credits = cifs_add_credits,
134 .set_credits = cifs_set_credits, 222 .set_credits = cifs_set_credits,
135 .get_credits_field = cifs_get_credits_field, 223 .get_credits_field = cifs_get_credits_field,
224 .get_next_mid = cifs_get_next_mid,
136 .read_data_offset = cifs_read_data_offset, 225 .read_data_offset = cifs_read_data_offset,
137 .read_data_length = cifs_read_data_length, 226 .read_data_length = cifs_read_data_length,
138 .map_error = map_smb_to_linux_error, 227 .map_error = map_smb_to_linux_error,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1b36ffe6a47..3097ee58fd7 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -779,7 +779,7 @@ send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
779 779
780 pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; 780 pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
781 pSMB->Timeout = 0; 781 pSMB->Timeout = 0;
782 pSMB->hdr.Mid = GetNextMid(ses->server); 782 pSMB->hdr.Mid = get_next_mid(ses->server);
783 783
784 return SendReceive(xid, ses, in_buf, out_buf, 784 return SendReceive(xid, ses, in_buf, out_buf,
785 &bytes_returned, 0); 785 &bytes_returned, 0);
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
new file mode 100644
index 00000000000..0e4e2eec5c1
--- /dev/null
+++ b/include/linux/frontswap.h
@@ -0,0 +1,127 @@
1#ifndef _LINUX_FRONTSWAP_H
2#define _LINUX_FRONTSWAP_H
3
4#include <linux/swap.h>
5#include <linux/mm.h>
6#include <linux/bitops.h>
7
8struct frontswap_ops {
9 void (*init)(unsigned);
10 int (*store)(unsigned, pgoff_t, struct page *);
11 int (*load)(unsigned, pgoff_t, struct page *);
12 void (*invalidate_page)(unsigned, pgoff_t);
13 void (*invalidate_area)(unsigned);
14};
15
16extern bool frontswap_enabled;
17extern struct frontswap_ops
18 frontswap_register_ops(struct frontswap_ops *ops);
19extern void frontswap_shrink(unsigned long);
20extern unsigned long frontswap_curr_pages(void);
21extern void frontswap_writethrough(bool);
22
23extern void __frontswap_init(unsigned type);
24extern int __frontswap_store(struct page *page);
25extern int __frontswap_load(struct page *page);
26extern void __frontswap_invalidate_page(unsigned, pgoff_t);
27extern void __frontswap_invalidate_area(unsigned);
28
29#ifdef CONFIG_FRONTSWAP
30
31static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
32{
33 bool ret = false;
34
35 if (frontswap_enabled && sis->frontswap_map)
36 ret = test_bit(offset, sis->frontswap_map);
37 return ret;
38}
39
40static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
41{
42 if (frontswap_enabled && sis->frontswap_map)
43 set_bit(offset, sis->frontswap_map);
44}
45
46static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
47{
48 if (frontswap_enabled && sis->frontswap_map)
49 clear_bit(offset, sis->frontswap_map);
50}
51
52static inline void frontswap_map_set(struct swap_info_struct *p,
53 unsigned long *map)
54{
55 p->frontswap_map = map;
56}
57
58static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
59{
60 return p->frontswap_map;
61}
62#else
63/* all inline routines become no-ops and all externs are ignored */
64
65#define frontswap_enabled (0)
66
67static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
68{
69 return false;
70}
71
72static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
73{
74}
75
76static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
77{
78}
79
80static inline void frontswap_map_set(struct swap_info_struct *p,
81 unsigned long *map)
82{
83}
84
85static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
86{
87 return NULL;
88}
89#endif
90
91static inline int frontswap_store(struct page *page)
92{
93 int ret = -1;
94
95 if (frontswap_enabled)
96 ret = __frontswap_store(page);
97 return ret;
98}
99
100static inline int frontswap_load(struct page *page)
101{
102 int ret = -1;
103
104 if (frontswap_enabled)
105 ret = __frontswap_load(page);
106 return ret;
107}
108
109static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset)
110{
111 if (frontswap_enabled)
112 __frontswap_invalidate_page(type, offset);
113}
114
115static inline void frontswap_invalidate_area(unsigned type)
116{
117 if (frontswap_enabled)
118 __frontswap_invalidate_area(type);
119}
120
121static inline void frontswap_init(unsigned type)
122{
123 if (frontswap_enabled)
124 __frontswap_init(type);
125}
126
127#endif /* _LINUX_FRONTSWAP_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b6661933e25..c84ec68eaec 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -197,6 +197,10 @@ struct swap_info_struct {
197 struct block_device *bdev; /* swap device or bdev of swap file */ 197 struct block_device *bdev; /* swap device or bdev of swap file */
198 struct file *swap_file; /* seldom referenced */ 198 struct file *swap_file; /* seldom referenced */
199 unsigned int old_block_size; /* seldom referenced */ 199 unsigned int old_block_size; /* seldom referenced */
200#ifdef CONFIG_FRONTSWAP
201 unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
202 atomic_t frontswap_pages; /* frontswap pages in-use counter */
203#endif
200}; 204};
201 205
202struct swap_list_t { 206struct swap_list_t {
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
new file mode 100644
index 00000000000..e282624e8c1
--- /dev/null
+++ b/include/linux/swapfile.h
@@ -0,0 +1,13 @@
1#ifndef _LINUX_SWAPFILE_H
2#define _LINUX_SWAPFILE_H
3
4/*
5 * these were static in swapfile.c but frontswap.c needs them and we don't
6 * want to expose them to the dozens of source files that include swap.h
7 */
8extern spinlock_t swap_lock;
9extern struct swap_list_t swap_list;
10extern struct swap_info_struct *swap_info[];
11extern int try_to_unuse(unsigned int, bool, unsigned long);
12
13#endif /* _LINUX_SWAPFILE_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98..82fed4eb2b6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
389 in a negligible performance hit. 389 in a negligible performance hit.
390 390
391 If unsure, say Y to enable cleancache 391 If unsure, say Y to enable cleancache
392
393config FRONTSWAP
394 bool "Enable frontswap to cache swap pages if tmem is present"
395 depends on SWAP
396 default n
397 help
398 Frontswap is so named because it can be thought of as the opposite
399 of a "backing" store for a swap device. The data is stored into
400 "transcendent memory", memory that is not directly accessible or
401 addressable by the kernel and is of unknown and possibly
402 time-varying size. When space in transcendent memory is available,
403 a significant swap I/O reduction may be achieved. When none is
404 available, all frontswap calls are reduced to a single pointer-
405 compare-against-NULL resulting in a negligible performance hit
406 and swap data is stored as normal on the matching swap device.
407
408 If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88..2e2fbbefb99 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
29 29
30obj-$(CONFIG_BOUNCE) += bounce.o 30obj-$(CONFIG_BOUNCE) += bounce.o
31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
32obj-$(CONFIG_FRONTSWAP) += frontswap.o
32obj-$(CONFIG_HAS_DMA) += dmapool.o 33obj-$(CONFIG_HAS_DMA) += dmapool.o
33obj-$(CONFIG_HUGETLBFS) += hugetlb.o 34obj-$(CONFIG_HUGETLBFS) += hugetlb.o
34obj-$(CONFIG_NUMA) += mempolicy.o 35obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 00000000000..e25025574a0
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
1/*
2 * Frontswap frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of frontswap. See
6 * Documentation/vm/frontswap.txt for more information.
7 *
8 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/mm.h>
15#include <linux/mman.h>
16#include <linux/swap.h>
17#include <linux/swapops.h>
18#include <linux/proc_fs.h>
19#include <linux/security.h>
20#include <linux/capability.h>
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/frontswap.h>
25#include <linux/swapfile.h>
26
27/*
28 * frontswap_ops is set by frontswap_register_ops to contain the pointers
29 * to the frontswap "backend" implementation functions.
30 */
31static struct frontswap_ops frontswap_ops __read_mostly;
32
33/*
34 * This global enablement flag reduces overhead on systems where frontswap_ops
35 * has not been registered, so is preferred to the slower alternative: a
36 * function call that checks a non-global.
37 */
38bool frontswap_enabled __read_mostly;
39EXPORT_SYMBOL(frontswap_enabled);
40
41/*
42 * If enabled, frontswap_store will return failure even on success. As
43 * a result, the swap subsystem will always write the page to swap, in
44 * effect converting frontswap into a writethrough cache. In this mode,
45 * there is no direct reduction in swap writes, but a frontswap backend
46 * can unilaterally "reclaim" any pages in use with no data loss, thus
47 * providing increased control over maximum memory usage due to frontswap.
48 */
49static bool frontswap_writethrough_enabled __read_mostly;
50
51#ifdef CONFIG_DEBUG_FS
52/*
53 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
54 * properly configured). These are for information only so are not protected
55 * against increment races.
56 */
57static u64 frontswap_loads;
58static u64 frontswap_succ_stores;
59static u64 frontswap_failed_stores;
60static u64 frontswap_invalidates;
61
62static inline void inc_frontswap_loads(void) {
63 frontswap_loads++;
64}
65static inline void inc_frontswap_succ_stores(void) {
66 frontswap_succ_stores++;
67}
68static inline void inc_frontswap_failed_stores(void) {
69 frontswap_failed_stores++;
70}
71static inline void inc_frontswap_invalidates(void) {
72 frontswap_invalidates++;
73}
74#else
75static inline void inc_frontswap_loads(void) { }
76static inline void inc_frontswap_succ_stores(void) { }
77static inline void inc_frontswap_failed_stores(void) { }
78static inline void inc_frontswap_invalidates(void) { }
79#endif
80/*
81 * Register operations for frontswap, returning the previous ops to allow
82 * detection of multiple backends and possible nesting.
83 */
84struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
85{
86 struct frontswap_ops old = frontswap_ops;
87
88 frontswap_ops = *ops;
89 frontswap_enabled = true;
90 return old;
91}
92EXPORT_SYMBOL(frontswap_register_ops);
93
94/*
95 * Enable/disable frontswap writethrough (see above).
96 */
97void frontswap_writethrough(bool enable)
98{
99 frontswap_writethrough_enabled = enable;
100}
101EXPORT_SYMBOL(frontswap_writethrough);
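/*
 * (Editor's aside, not part of the patch: a backend wanting writethrough
 * semantics would call frontswap_writethrough(true) from its own init;
 * thereafter every page is written to the swap device even when the store
 * succeeded, so the backend may discard pages at any time without data loss.)
 */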
102
103/*
104 * Called when a swap device is swapon'd.
105 */
106void __frontswap_init(unsigned type)
107{
108 struct swap_info_struct *sis = swap_info[type];
109
110 BUG_ON(sis == NULL);
111 if (sis->frontswap_map == NULL)
112 return;
113 if (frontswap_enabled)
114 (*frontswap_ops.init)(type);
115}
116EXPORT_SYMBOL(__frontswap_init);
117
118/*
119 * "Store" data from a page to frontswap and associate it with the page's
120 * swaptype and offset. Page must be locked and in the swap cache.
121 * If frontswap already contains a page with matching swaptype and
122 * offset, the frontswap implementation may either overwrite the data and
123 * return success or invalidate the page from frontswap and return failure.
124 */
125int __frontswap_store(struct page *page)
126{
127 int ret = -1, dup = 0;
128 swp_entry_t entry = { .val = page_private(page), };
129 int type = swp_type(entry);
130 struct swap_info_struct *sis = swap_info[type];
131 pgoff_t offset = swp_offset(entry);
132
133 BUG_ON(!PageLocked(page));
134 BUG_ON(sis == NULL);
135 if (frontswap_test(sis, offset))
136 dup = 1;
137 ret = (*frontswap_ops.store)(type, offset, page);
138 if (ret == 0) {
139 frontswap_set(sis, offset);
140 inc_frontswap_succ_stores();
141 if (!dup)
142 atomic_inc(&sis->frontswap_pages);
143 } else if (dup) {
144 /*
145 * a failed dup always results in automatic invalidation of
146 * the (older) page from frontswap
147 */
148 frontswap_clear(sis, offset);
149 atomic_dec(&sis->frontswap_pages);
150 inc_frontswap_failed_stores();
151 } else
152 inc_frontswap_failed_stores();
153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */
155 ret = -1;
156 return ret;
157}
158EXPORT_SYMBOL(__frontswap_store);
159
160/*
161 * "Load" data from frontswap associated with the swaptype and offset that
162 * were specified when the data was stored to frontswap, and use it to fill
163 * the specified page with data. Page must be locked and in the swap cache.
164 */
165int __frontswap_load(struct page *page)
166{
167 int ret = -1;
168 swp_entry_t entry = { .val = page_private(page), };
169 int type = swp_type(entry);
170 struct swap_info_struct *sis = swap_info[type];
171 pgoff_t offset = swp_offset(entry);
172
173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset))
176 ret = (*frontswap_ops.load)(type, offset, page);
177 if (ret == 0)
178 inc_frontswap_loads();
179 return ret;
180}
181EXPORT_SYMBOL(__frontswap_load);
182
183/*
184 * Invalidate any data from frontswap associated with the specified swaptype
185 * and offset so that a subsequent "load" will fail.
186 */
187void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
188{
189 struct swap_info_struct *sis = swap_info[type];
190
191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) {
193 (*frontswap_ops.invalidate_page)(type, offset);
194 atomic_dec(&sis->frontswap_pages);
195 frontswap_clear(sis, offset);
196 inc_frontswap_invalidates();
197 }
198}
199EXPORT_SYMBOL(__frontswap_invalidate_page);
200
201/*
202 * Invalidate all data from frontswap associated with all offsets for the
203 * specified swaptype.
204 */
205void __frontswap_invalidate_area(unsigned type)
206{
207 struct swap_info_struct *sis = swap_info[type];
208
209 BUG_ON(sis == NULL);
210 if (sis->frontswap_map == NULL)
211 return;
212 (*frontswap_ops.invalidate_area)(type);
213 atomic_set(&sis->frontswap_pages, 0);
214 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
215}
216EXPORT_SYMBOL(__frontswap_invalidate_area);
217
218/*
219 * Frontswap, like a true swap device, may unnecessarily retain pages
220 * under certain circumstances; "shrink" frontswap is essentially a
221 * "partial swapoff" that works by calling try_to_unuse to attempt to
222 * unuse enough frontswap pages to reduce -- subject to memory
223 * constraints -- the number of pages in frontswap to the number
224 * given in the parameter target_pages.
225 */
226void frontswap_shrink(unsigned long target_pages)
227{
228 struct swap_info_struct *si = NULL;
229 int si_frontswap_pages;
230 unsigned long total_pages = 0, total_pages_to_unuse;
231 unsigned long pages = 0, pages_to_unuse = 0;
232 int type;
233 bool locked = false;
234
235 /*
236 * We don't want to hold swap_lock while doing a very
237 * lengthy try_to_unuse, but swap_list may change,
238 * so restart the scan from swap_list.head each time.
239 */
240 spin_lock(&swap_lock);
241 locked = true;
242 total_pages = 0;
243 for (type = swap_list.head; type >= 0; type = si->next) {
244 si = swap_info[type];
245 total_pages += atomic_read(&si->frontswap_pages);
246 }
247 if (total_pages <= target_pages)
248 goto out;
249 total_pages_to_unuse = total_pages - target_pages;
250 for (type = swap_list.head; type >= 0; type = si->next) {
251 si = swap_info[type];
252 si_frontswap_pages = atomic_read(&si->frontswap_pages);
253 if (total_pages_to_unuse < si_frontswap_pages)
254 pages = pages_to_unuse = total_pages_to_unuse;
255 else {
256 pages = si_frontswap_pages;
257 pages_to_unuse = 0; /* unuse all */
258 }
259 /* ensure there is enough RAM to fetch pages from frontswap */
260 if (security_vm_enough_memory_mm(current->mm, pages))
261 continue;
262 vm_unacct_memory(pages);
263 break;
264 }
265 if (type < 0)
266 goto out;
267 locked = false;
268 spin_unlock(&swap_lock);
269 try_to_unuse(type, true, pages_to_unuse);
270out:
271 if (locked)
272 spin_unlock(&swap_lock);
273 return;
274}
275EXPORT_SYMBOL(frontswap_shrink);
276
277/*
278 * Count and return the number of frontswap pages across all
279 * swap devices. This is exported so that backend drivers can
280 * determine current usage without reading debugfs.
281 */
282unsigned long frontswap_curr_pages(void)
283{
284 int type;
285 unsigned long totalpages = 0;
286 struct swap_info_struct *si = NULL;
287
288 spin_lock(&swap_lock);
289 for (type = swap_list.head; type >= 0; type = si->next) {
290 si = swap_info[type];
291 totalpages += atomic_read(&si->frontswap_pages);
292 }
293 spin_unlock(&swap_lock);
294 return totalpages;
295}
296EXPORT_SYMBOL(frontswap_curr_pages);
297
298static int __init init_frontswap(void)
299{
300#ifdef CONFIG_DEBUG_FS
301 struct dentry *root = debugfs_create_dir("frontswap", NULL);
302 if (root == NULL)
303 return -ENXIO;
304 debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
305 debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
306 debugfs_create_u64("failed_stores", S_IRUGO, root,
307 &frontswap_failed_stores);
308 debugfs_create_u64("invalidates", S_IRUGO,
309 root, &frontswap_invalidates);
310#endif
311 return 0;
312}
313
314module_init(init_frontswap);
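
(To make the registration contract above concrete, here is an editor's sketch
of the smallest possible backend. All names are hypothetical; the exact
prototypes live in include/linux/frontswap.h, not shown in this hunk, and the
ops fields and argument orders below are inferred from the call sites in
mm/frontswap.c above. Rejecting every store is always legal, since the swap
path simply falls back to the disk write:

	#include <linux/module.h>
	#include <linux/swap.h>
	#include <linux/frontswap.h>

	static void noop_init(unsigned type)
	{
	}

	static int noop_store(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* reject: page goes to the swap device instead */
	}

	static int noop_load(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* nothing was ever stored, so nothing to load */
	}

	static void noop_invalidate_page(unsigned type, pgoff_t offset)
	{
	}

	static void noop_invalidate_area(unsigned type)
	{
	}

	static struct frontswap_ops noop_ops = {
		.init = noop_init,
		.store = noop_store,
		.load = noop_load,
		.invalidate_page = noop_invalidate_page,
		.invalidate_area = noop_invalidate_area,
	};

	static int __init noop_frontswap_init(void)
	{
		struct frontswap_ops old = frontswap_register_ops(&noop_ops);

		if (old.store != NULL)
			printk(KERN_INFO "noop-frontswap: replaced another backend\n");
		return 0;
	}
	module_init(noop_frontswap_init);

Note that frontswap_register_ops returns the previous ops by value, which is
how a backend can detect that another backend was already registered.)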
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611..34f02923744 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/frontswap.h>
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22 23
23static struct bio *get_swap_bio(gfp_t gfp_flags, 24static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 unlock_page(page); 99 unlock_page(page);
99 goto out; 100 goto out;
100 } 101 }
102 if (frontswap_store(page) == 0) {
103 set_page_writeback(page);
104 unlock_page(page);
105 end_page_writeback(page);
106 goto out;
107 }
101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
102 if (bio == NULL) { 109 if (bio == NULL) {
103 set_page_dirty(page); 110 set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
122 129
123 VM_BUG_ON(!PageLocked(page)); 130 VM_BUG_ON(!PageLocked(page));
124 VM_BUG_ON(PageUptodate(page)); 131 VM_BUG_ON(PageUptodate(page));
132 if (frontswap_load(page) == 0) {
133 SetPageUptodate(page);
134 unlock_page(page);
135 goto out;
136 }
125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
126 if (bio == NULL) { 138 if (bio == NULL) {
127 unlock_page(page); 139 unlock_page(page);
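
(The two hooks above are the entire fast-path integration: a successful
frontswap_store short-circuits the bio write, and a successful frontswap_load
short-circuits the bio read. For the slow path, frontswap_shrink and
frontswap_curr_pages are exported so that a backend under its own memory
pressure can push pages back out through the normal swap path. A hypothetical
policy hook, sketched by the editor with invented names:

	#include <linux/frontswap.h>

	/*
	 * If frontswap currently holds more than 'limit' pages, ask the
	 * frontend to unuse the excess. Note that frontswap_shrink()
	 * takes the target residual size, not the number of pages to evict.
	 */
	static void my_backend_trim(unsigned long limit)
	{
		if (frontswap_curr_pages() > limit)
			frontswap_shrink(limit);
	}

)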
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef5..de5bc51c4a6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
34 36
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 44static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 45static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 46
45static DEFINE_SPINLOCK(swap_lock); 47DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 48static unsigned int nr_swapfiles;
47long nr_swap_pages; 49long nr_swap_pages;
48long total_swap_pages; 50long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 55static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 56static const char Unused_offset[] = "Unused swap offset entry ";
55 57
56static struct swap_list_t swap_list = {-1, -1}; 58struct swap_list_t swap_list = {-1, -1};
57 59
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 60struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 61
60static DEFINE_MUTEX(swapon_mutex); 62static DEFINE_MUTEX(swapon_mutex);
61 63
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
559 if ((p->flags & SWP_BLKDEV) && 562 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify) 563 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
985} 988}
986 989
987/* 990/*
988 * Scan swap_map from current position to next entry still in use. 991 * Scan swap_map (or frontswap_map if frontswap parameter is true)
992 * from current position to next entry still in use.
989 * Recycle to start on reaching the end, returning 0 when empty. 993 * Recycle to start on reaching the end, returning 0 when empty.
990 */ 994 */
991static unsigned int find_next_to_unuse(struct swap_info_struct *si, 995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
992 unsigned int prev) 996 unsigned int prev, bool frontswap)
993{ 997{
994 unsigned int max = si->max; 998 unsigned int max = si->max;
995 unsigned int i = prev; 999 unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1015 prev = 0; 1019 prev = 0;
1016 i = 1; 1020 i = 1;
1017 } 1021 }
1022 if (frontswap) {
1023 if (frontswap_test(si, i))
1024 break;
1025 else
1026 continue;
1027 }
1018 count = si->swap_map[i]; 1028 count = si->swap_map[i];
1019 if (count && swap_count(count) != SWAP_MAP_BAD) 1029 if (count && swap_count(count) != SWAP_MAP_BAD)
1020 break; 1030 break;
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1026 * We completely avoid races by reading each swap page in advance, 1036 * We completely avoid races by reading each swap page in advance,
1027 * and then search for the process using it. All the necessary 1037 * and then search for the process using it. All the necessary
1028 * page table adjustments can then be made atomically. 1038 * page table adjustments can then be made atomically.
1039 *
1040 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1041 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1029 */ 1042 */
1030static int try_to_unuse(unsigned int type) 1043int try_to_unuse(unsigned int type, bool frontswap,
1044 unsigned long pages_to_unuse)
1031{ 1045{
1032 struct swap_info_struct *si = swap_info[type]; 1046 struct swap_info_struct *si = swap_info[type];
1033 struct mm_struct *start_mm; 1047 struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
1060 * one pass through swap_map is enough, but not necessarily: 1074 * one pass through swap_map is enough, but not necessarily:
1061 * there are races when an instance of an entry might be missed. 1075 * there are races when an instance of an entry might be missed.
1062 */ 1076 */
1063 while ((i = find_next_to_unuse(si, i)) != 0) { 1077 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1064 if (signal_pending(current)) { 1078 if (signal_pending(current)) {
1065 retval = -EINTR; 1079 retval = -EINTR;
1066 break; 1080 break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
1227 * interactive performance. 1241 * interactive performance.
1228 */ 1242 */
1229 cond_resched(); 1243 cond_resched();
1244 if (frontswap && pages_to_unuse > 0) {
1245 if (!--pages_to_unuse)
1246 break;
1247 }
1230 } 1248 }
1231 1249
1232 mmput(start_mm); 1250 mmput(start_mm);
@@ -1486,7 +1504,8 @@ bad_bmap:
1486} 1504}
1487 1505
1488static void enable_swap_info(struct swap_info_struct *p, int prio, 1506static void enable_swap_info(struct swap_info_struct *p, int prio,
1489 unsigned char *swap_map) 1507 unsigned char *swap_map,
1508 unsigned long *frontswap_map)
1490{ 1509{
1491 int i, prev; 1510 int i, prev;
1492 1511
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1496 else 1515 else
1497 p->prio = --least_priority; 1516 p->prio = --least_priority;
1498 p->swap_map = swap_map; 1517 p->swap_map = swap_map;
1518 frontswap_map_set(p, frontswap_map);
1499 p->flags |= SWP_WRITEOK; 1519 p->flags |= SWP_WRITEOK;
1500 nr_swap_pages += p->pages; 1520 nr_swap_pages += p->pages;
1501 total_swap_pages += p->pages; 1521 total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1512 swap_list.head = swap_list.next = p->type; 1532 swap_list.head = swap_list.next = p->type;
1513 else 1533 else
1514 swap_info[prev]->next = p->type; 1534 swap_info[prev]->next = p->type;
1535 frontswap_init(p->type);
1515 spin_unlock(&swap_lock); 1536 spin_unlock(&swap_lock);
1516} 1537}
1517 1538
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1585 spin_unlock(&swap_lock); 1606 spin_unlock(&swap_lock);
1586 1607
1587 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1608 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1588 err = try_to_unuse(type); 1609 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1589 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1610 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1590 1611
1591 if (err) { 1612 if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1596 * sys_swapoff for this swap_info_struct at this point. 1617 * sys_swapoff for this swap_info_struct at this point.
1597 */ 1618 */
1598 /* re-insert swap space back into swap_list */ 1619 /* re-insert swap space back into swap_list */
1599 enable_swap_info(p, p->prio, p->swap_map); 1620 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1600 goto out_dput; 1621 goto out_dput;
1601 } 1622 }
1602 1623
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1622 swap_map = p->swap_map; 1643 swap_map = p->swap_map;
1623 p->swap_map = NULL; 1644 p->swap_map = NULL;
1624 p->flags = 0; 1645 p->flags = 0;
1646 frontswap_invalidate_area(type);
1625 spin_unlock(&swap_lock); 1647 spin_unlock(&swap_lock);
1626 mutex_unlock(&swapon_mutex); 1648 mutex_unlock(&swapon_mutex);
1627 vfree(swap_map); 1649 vfree(swap_map);
1650 vfree(frontswap_map_get(p));
1628 /* Destroy swap account information */ 1651 /* Destroy swap account information */
1629 swap_cgroup_swapoff(type); 1652 swap_cgroup_swapoff(type);
1630 1653
@@ -1988,6 +2011,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1988 sector_t span; 2011 sector_t span;
1989 unsigned long maxpages; 2012 unsigned long maxpages;
1990 unsigned char *swap_map = NULL; 2013 unsigned char *swap_map = NULL;
2014 unsigned long *frontswap_map = NULL;
1991 struct page *page = NULL; 2015 struct page *page = NULL;
1992 struct inode *inode = NULL; 2016 struct inode *inode = NULL;
1993 2017
@@ -2071,6 +2095,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2071 error = nr_extents; 2095 error = nr_extents;
2072 goto bad_swap; 2096 goto bad_swap;
2073 } 2097 }
2098 /* frontswap enabled? set up bit-per-page map for frontswap */
2099 if (frontswap_enabled)
2100 frontswap_map = vzalloc(maxpages / sizeof(long));
2074 2101
2075 if (p->bdev) { 2102 if (p->bdev) {
2076 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2103 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2113,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2086 if (swap_flags & SWAP_FLAG_PREFER) 2113 if (swap_flags & SWAP_FLAG_PREFER)
2087 prio = 2114 prio =
2088 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2115 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2089 enable_swap_info(p, prio, swap_map); 2116 enable_swap_info(p, prio, swap_map, frontswap_map);
2090 2117
2091 printk(KERN_INFO "Adding %uk swap on %s. " 2118 printk(KERN_INFO "Adding %uk swap on %s. "
2092 "Priority:%d extents:%d across:%lluk %s%s\n", 2119 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2093 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2120 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2094 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2121 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2095 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2122 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2096 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2123 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2124 (frontswap_map) ? "FS" : "");
2097 2125
2098 mutex_unlock(&swapon_mutex); 2126 mutex_unlock(&swapon_mutex);
2099 atomic_inc(&proc_poll_event); 2127 atomic_inc(&proc_poll_event);
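
(One arithmetic detail worth flagging in the swapon/swapoff hunks above: the
frontswap map is a bit-per-page bitmap, yet it is allocated as
maxpages / sizeof(long) bytes and cleared as sis->max / sizeof(long) bytes.
On 64-bit that happens to equal maxpages / 8, i.e. rounding down to whole
bytes, while on 32-bit it allocates twice the needed space. A more
conventional sizing -- an editor's sketch, not part of this patch -- would
round up to whole longs:

	#include <linux/bitops.h>
	#include <linux/vmalloc.h>

	/*
	 * BITS_TO_LONGS rounds up, so all 'maxpages' bits fit even when
	 * maxpages is not a multiple of BITS_PER_LONG.
	 */
	static unsigned long *alloc_frontswap_map(unsigned long maxpages)
	{
		return vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
	}

)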