-rw-r--r--  Documentation/vm/frontswap.txt        | 278
-rw-r--r--  MAINTAINERS                           |   7
-rw-r--r--  drivers/staging/ramster/zcache-main.c |   8
-rw-r--r--  drivers/staging/zcache/zcache-main.c  |  10
-rw-r--r--  drivers/xen/tmem.c                    |   8
-rw-r--r--  fs/cifs/cifsglob.h                    |   7
-rw-r--r--  fs/cifs/cifsproto.h                   |   1
-rw-r--r--  fs/cifs/cifssmb.c                     |   8
-rw-r--r--  fs/cifs/connect.c                     |   8
-rw-r--r--  fs/cifs/file.c                        | 106
-rw-r--r--  fs/cifs/misc.c                        |  89
-rw-r--r--  fs/cifs/smb1ops.c                     |  89
-rw-r--r--  fs/cifs/transport.c                   |   2
-rw-r--r--  include/linux/frontswap.h             | 127
-rw-r--r--  include/linux/swap.h                  |   4
-rw-r--r--  include/linux/swapfile.h              |  13
-rw-r--r--  mm/Kconfig                            |  17
-rw-r--r--  mm/Makefile                           |   1
-rw-r--r--  mm/frontswap.c                        | 314
-rw-r--r--  mm/nommu.c                            |   2
-rw-r--r--  mm/page_io.c                          |  12
-rw-r--r--  mm/swapfile.c                         |  54
22 files changed, 995 insertions(+), 170 deletions(-)
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt
new file mode 100644
index 000000000000..37067cf455f4
--- /dev/null
+++ b/Documentation/vm/frontswap.txt
@@ -0,0 +1,278 @@
+Frontswap provides a "transcendent memory" interface for swap pages.
+In some environments, dramatic performance savings may be obtained because
+swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
+
+(Note, frontswap -- and cleancache (merged at 3.0) -- are the "frontends"
+and the only necessary changes to the core kernel for transcendent memory;
+all other supporting code -- the "backends" -- is implemented as drivers.
+See the LWN.net article "Transcendent memory in a nutshell" for a detailed
+overview of frontswap and related kernel parts:
+https://lwn.net/Articles/454795/ )
+
+Frontswap is so named because it can be thought of as the opposite of
+a "backing" store for a swap device. The storage is assumed to be
+a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
+to the requirements of transcendent memory (such as Xen's "tmem", or
+in-kernel compressed memory, aka "zcache", or future RAM-like devices);
+this pseudo-RAM device is not directly accessible or addressable by the
+kernel and is of unknown and possibly time-varying size. The driver
+links itself to frontswap by calling frontswap_register_ops to set the
+frontswap_ops funcs appropriately and the functions it provides must
+conform to certain policies as follows:
+
+An "init" prepares the device to receive frontswap pages associated
+with the specified swap device number (aka "type"). A "store" will
+copy the page to transcendent memory and associate it with the type and
+offset associated with the page. A "load" will copy the page, if found,
+from transcendent memory into kernel memory, but will NOT remove the page
+from transcendent memory. An "invalidate_page" will remove the page
+from transcendent memory and an "invalidate_area" will remove ALL pages
+associated with the swap type (e.g., like swapoff) and notify the "device"
+to refuse further stores with that swap type.
+
+Once a page is successfully stored, a matching load on the page will normally
+succeed. So when the kernel finds itself in a situation where it needs
+to swap out a page, it first attempts to use frontswap. If the store returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a store returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+If a backend chooses, frontswap can be configured as a "writethrough
+cache" by calling frontswap_writethrough(). In this mode, the reduction
+in swap device writes is lost (along with a non-trivial performance
+advantage) in order to allow the backend to arbitrarily "reclaim" space
+used to store frontswap pages and more completely manage its memory usage.
+
+Note that if a page is stored and the page already exists in transcendent
+memory (a "duplicate" store), either the store succeeds and the data is
+overwritten, or the store fails AND the page is invalidated. This ensures
+stale data may never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the /sys/kernel/debug/frontswap directory. The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+failed_stores - how many store attempts have failed
+loads - how many loads were attempted (all should succeed)
+succ_stores - how many store attempts have succeeded
+invalidates - how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+
+1) Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable by the
+kernel. This interface is ideal when data is transformed to a different
+form and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices). Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
+cleancache) interface to transcendent memory provides a nice way to read
+and write -- and indirectly "name" -- the pages.
+
+Frontswap -- and cleancache -- with a fairly small impact on the kernel,
+provide a huge amount of flexibility for more dynamic, flexible RAM
+utilization in various system configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total anonymous pages
+that can be safely kept in RAM. Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems. Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM. This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa. RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines. This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization. And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "selfballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM (if overall host system memory
+conditions allow), thus mitigating the potentially awful performance impact
+of unplanned swapping.
+
+A KVM implementation is underway and has been RFC'ed to lkml. And,
+using frontswap, investigation is also underway on the use of NVM as
+a memory extension technology.
+
+2) Sure there may be performance advantages in some situations, but
+   what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'd
+swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra compare of a global variable against zero
+for every swap page read or written. If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "store"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd. This is added to the EIGHT bits (which
+were sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd. (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.) For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap: a 32GB device holds 8M 4K
+pages, and one bit per page is 8M bits, i.e. 1MB.
+
+When swap pages are stored in transcendent memory instead of written
+out to disk, there is a side effect that this may create more memory
+pressure that can potentially outweigh the other advantages. A
+backend, such as zcache, must implement policies to carefully (but
+dynamically) manage memory limits to ensure this doesn't happen.
+
+3) OK, how about a quick overview of what this frontswap patch does
+   in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel. Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "store" swap pages
+associated with that number.
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (cf. swap_writepage()), frontswap_store() is called. Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_store() returns -1 and the kernel swaps the page
+to the swap device as normal. Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page. But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data. In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_load() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend. If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete. If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device write
+and (potentially) a swap device read are replaced by a "frontswap backend
+store" and (possibly) a "frontswap backend load", which are presumably much
+faster.
+
+4) Can't frontswap be configured as a "special" swap device that is
+   just higher priority than any real swap device (e.g. like zswap,
+   or maybe swap-over-nbd/NFS)?
+
+No. First, the existing swap subsystem doesn't allow for any kind of
+swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy,
+but this would require fairly drastic changes. Even if it were
+rewritten, the existing swap subsystem uses the block I/O layer which
+assumes a swap device is fixed size and any page in it is linearly
+addressable. Frontswap barely touches the existing swap subsystem,
+and works around the constraints of the block I/O subsystem to provide
+a great deal of flexibility and dynamicity.
+
+For example, the acceptance of any swap page by the frontswap backend is
+entirely unpredictable. This is critical to the definition of frontswap
+backends because it grants completely dynamic discretion to the
+backend. In zcache, one cannot know a priori how compressible a page is.
+"Poorly" compressible pages can be rejected, and "poorly" can itself be
+defined dynamically depending on current memory constraints.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O. The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time. Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem. That said, only the initial "store"
+and "load" operations need be synchronous. A separate asynchronous thread
+is free to manipulate the pages stored by frontswap. For example,
+the "remotification" thread in RAMster uses standard asynchronous
+kernel sockets to move compressed frontswap pages to a remote machine.
+Similarly, a KVM guest-side implementation could do in-guest compression
+and use "batched" hypercalls.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit". For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+There is a downside to the transcendent memory specifications for
+frontswap: Since any "store" might fail, there must always be a real
+slot on a real swap device to swap the page. Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all. This means
+that frontswap cannot contain more pages than the total capacity of
+the swapon'd swap devices. For example, if NO swap device is configured
+on some installation, frontswap is useless. Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+5) Why this weird definition about "duplicate stores"? If a page
+   has been previously successfully stored, can't it always be
+   successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot. Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K. Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K. But the backend
+has no more space. In this case, the store must be rejected. Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible. Since the
+swap subsystem then writes the new data to the real swap device,
+this is the correct course of action to ensure coherency.
+
+6) What is frontswap_shrink for?
+
+When the (non-frontswap) swap subsystem swaps out a page to a real
+swap device, that page is only taking up low-value pre-allocated disk
+space. But if frontswap has placed a page in transcendent memory, that
+page may be taking up valuable real estate. The frontswap_shrink
+routine allows code outside of the swap subsystem to force pages out
+of the memory managed by frontswap and back into kernel-addressable memory.
+For example, in RAMster, a "suction driver" thread will attempt
+to "repatriate" pages sent to a remote machine back to the local machine;
+this is driven using the frontswap_shrink mechanism when memory pressure
+subsides.
+
+7) Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global. This seemed a reasonable compromise: Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
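
For illustration only (not part of this patch): the "init", "store", "load",
"invalidate_page" and "invalidate_area" policies described above map directly
onto struct frontswap_ops as added in include/linux/frontswap.h below. A
minimal, hypothetical backend sketch follows; every "toy_" name is invented
here, and a backend that rejects every store is legal because the kernel
always falls back to the real swap device.

#include <linux/module.h>
#include <linux/frontswap.h>

static void toy_init(unsigned type)
{
	/* set up per-swap-device ("type") state here */
}

static int toy_store(unsigned type, pgoff_t offset, struct page *page)
{
	/* copy the page into backend-owned memory; return 0 if accepted */
	return -1;	/* reject: the page then goes to the swap device */
}

static int toy_load(unsigned type, pgoff_t offset, struct page *page)
{
	/* fill the page from backend memory; -1 means not present */
	return -1;
}

static void toy_invalidate_page(unsigned type, pgoff_t offset)
{
	/* drop the single page stored for (type, offset) */
}

static void toy_invalidate_area(unsigned type)
{
	/* drop ALL pages for this swap type, e.g. at swapoff */
}

static struct frontswap_ops toy_frontswap_ops = {
	.init		= toy_init,
	.store		= toy_store,
	.load		= toy_load,
	.invalidate_page = toy_invalidate_page,
	.invalidate_area = toy_invalidate_area,
};

static int __init toy_frontswap_setup(void)
{
	frontswap_register_ops(&toy_frontswap_ops);
	return 0;
}
module_init(toy_frontswap_setup);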
diff --git a/MAINTAINERS b/MAINTAINERS
index 55f0fda602ec..6a52bb4a4fc7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2930,6 +2930,13 @@ F:	Documentation/power/freezing-of-tasks.txt
 F:	include/linux/freezer.h
 F:	kernel/freezer.c
 
+FRONTSWAP API
+M:	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	mm/frontswap.c
+F:	include/linux/frontswap.h
+
 FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS
 M:	David Howells <dhowells@redhat.com>
 L:	linux-cachefs@redhat.com
diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c
index 4e7ef0e6b79c..d46764b5aaba 100644
--- a/drivers/staging/ramster/zcache-main.c
+++ b/drivers/staging/ramster/zcache-main.c
@@ -3002,7 +3002,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
 	return oid;
 }
 
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_store(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -3025,7 +3025,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
 
 /* returns 0 if the page was successfully gotten from frontswap, -1 if
  * was not present (should never happen!) */
-static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_load(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -3080,8 +3080,8 @@ static void zcache_frontswap_init(unsigned ignored)
 }
 
 static struct frontswap_ops zcache_frontswap_ops = {
-	.put_page = zcache_frontswap_put_page,
-	.get_page = zcache_frontswap_get_page,
+	.store = zcache_frontswap_store,
+	.load = zcache_frontswap_load,
 	.invalidate_page = zcache_frontswap_flush_page,
 	.invalidate_area = zcache_frontswap_flush_area,
 	.init = zcache_frontswap_init
diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
index 2734dacacbaf..784c796b9848 100644
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -1835,7 +1835,7 @@ static int zcache_frontswap_poolid = -1;
  * Swizzling increases objects per swaptype, increasing tmem concurrency
  * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
  * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
- * frontswap_get_page(), but has side-effects. Hence using 8.
+ * frontswap_load(), but has side-effects. Hence using 8.
  */
 #define SWIZ_BITS 8
 #define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
@@ -1849,7 +1849,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
 	return oid;
 }
 
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_store(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -1870,7 +1870,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
 
 /* returns 0 if the page was successfully gotten from frontswap, -1 if
  * was not present (should never happen!) */
-static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
+static int zcache_frontswap_load(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -1919,8 +1919,8 @@ static void zcache_frontswap_init(unsigned ignored)
 }
 
 static struct frontswap_ops zcache_frontswap_ops = {
-	.put_page = zcache_frontswap_put_page,
-	.get_page = zcache_frontswap_get_page,
+	.store = zcache_frontswap_store,
+	.load = zcache_frontswap_load,
 	.invalidate_page = zcache_frontswap_flush_page,
 	.invalidate_area = zcache_frontswap_flush_area,
 	.init = zcache_frontswap_init
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index dcb79521e6c8..89f264c67420 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -269,7 +269,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind)
 }
 
 /* returns 0 if the page was successfully put into frontswap, -1 if not */
-static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
+static int tmem_frontswap_store(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -295,7 +295,7 @@ static int tmem_frontswap_put_page(unsigned type, pgoff_t offset,
  * returns 0 if the page was successfully gotten from frontswap, -1 if
  * was not present (should never happen!)
  */
-static int tmem_frontswap_get_page(unsigned type, pgoff_t offset,
+static int tmem_frontswap_load(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	u64 ind64 = (u64)offset;
@@ -362,8 +362,8 @@ static int __init no_frontswap(char *s)
 __setup("nofrontswap", no_frontswap);
 
 static struct frontswap_ops __initdata tmem_frontswap_ops = {
-	.put_page = tmem_frontswap_put_page,
-	.get_page = tmem_frontswap_get_page,
+	.store = tmem_frontswap_store,
+	.load = tmem_frontswap_load,
 	.invalidate_page = tmem_frontswap_flush_page,
 	.invalidate_area = tmem_frontswap_flush_area,
 	.init = tmem_frontswap_init
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 20350a93ed99..6df0cbe1cbc9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -174,6 +174,7 @@ struct smb_version_operations {
 	void (*add_credits)(struct TCP_Server_Info *, const unsigned int);
 	void (*set_credits)(struct TCP_Server_Info *, const int);
 	int * (*get_credits_field)(struct TCP_Server_Info *);
+	__u64 (*get_next_mid)(struct TCP_Server_Info *);
 	/* data offset from read response message */
 	unsigned int (*read_data_offset)(char *);
 	/* data length from read response message */
@@ -399,6 +400,12 @@ set_credits(struct TCP_Server_Info *server, const int val)
 	server->ops->set_credits(server, val);
 }
 
+static inline __u64
+get_next_mid(struct TCP_Server_Info *server)
+{
+	return server->ops->get_next_mid(server);
+}
+
 /*
  * Macros to allow the TCP_Server_Info->net field and related code to drop out
  * when CONFIG_NET_NS isn't set.
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5ec21ecf7980..0a6cbfe2761e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -114,7 +114,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 			void **request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
 			const struct nls_table *nls_cp);
-extern __u64 GetNextMid(struct TCP_Server_Info *server);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b5ad716b2642..5b400730c213 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -268,7 +268,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 		return rc;
 
 	buffer = (struct smb_hdr *)*request_buf;
-	buffer->Mid = GetNextMid(ses->server);
+	buffer->Mid = get_next_mid(ses->server);
 	if (ses->capabilities & CAP_UNICODE)
 		buffer->Flags2 |= SMBFLG2_UNICODE;
 	if (ses->capabilities & CAP_STATUS32)
@@ -402,7 +402,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 
 	cFYI(1, "secFlags 0x%x", secFlags);
 
-	pSMB->hdr.Mid = GetNextMid(server);
+	pSMB->hdr.Mid = get_next_mid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
 
 	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
@@ -782,7 +782,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
 		return rc;
 	}
 
-	pSMB->hdr.Mid = GetNextMid(ses->server);
+	pSMB->hdr.Mid = get_next_mid(ses->server);
 
 	if (ses->server->sec_mode &
 		(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
@@ -4762,7 +4762,7 @@ getDFSRetry:
 
 	/* server pointer checked in called function,
 	but should never be null here anyway */
-	pSMB->hdr.Mid = GetNextMid(ses->server);
+	pSMB->hdr.Mid = get_next_mid(ses->server);
 	pSMB->hdr.Tid = ses->ipc_tid;
 	pSMB->hdr.Uid = ses->Suid;
 	if (ses->capabilities & CAP_STATUS32)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ccafdedd0dbc..78db68a5cf44 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1058,13 +1058,15 @@ cifs_demultiplex_thread(void *p)
 		if (mid_entry != NULL) {
 			if (!mid_entry->multiRsp || mid_entry->multiEnd)
 				mid_entry->callback(mid_entry);
-		} else if (!server->ops->is_oplock_break(buf, server)) {
+		} else if (!server->ops->is_oplock_break ||
+			   !server->ops->is_oplock_break(buf, server)) {
 			cERROR(1, "No task to wake, unknown frame received! "
 				   "NumMids %d", atomic_read(&midCount));
 			cifs_dump_mem("Received Data is: ", buf,
 				      HEADER_SIZE(server));
 #ifdef CONFIG_CIFS_DEBUG2
-			server->ops->dump_detail(buf);
+			if (server->ops->dump_detail)
+				server->ops->dump_detail(buf);
 			cifs_dump_mids(server);
 #endif /* CIFS_DEBUG2 */
 
@@ -3938,7 +3940,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 	header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
 			NULL /*no tid */ , 4 /*wct */ );
 
-	smb_buffer->Mid = GetNextMid(ses->server);
+	smb_buffer->Mid = get_next_mid(ses->server);
 	smb_buffer->Uid = ses->Suid;
 	pSMB = (TCONX_REQ *) smb_buffer;
 	pSMBr = (TCONX_RSP *) smb_buffer_response;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 253170dfa716..513adbc211d7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -876,7 +876,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 	struct cifsLockInfo *li, *tmp;
 	struct cifs_tcon *tcon;
 	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
-	unsigned int num, max_num;
+	unsigned int num, max_num, max_buf;
 	LOCKING_ANDX_RANGE *buf, *cur;
 	int types[] = {LOCKING_ANDX_LARGE_FILES,
 		       LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
@@ -892,8 +892,19 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 		return rc;
 	}
 
-	max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
-		  sizeof(LOCKING_ANDX_RANGE);
+	/*
+	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
+	 * and check it for zero before using.
+	 */
+	max_buf = tcon->ses->server->maxBuf;
+	if (!max_buf) {
+		mutex_unlock(&cinode->lock_mutex);
+		FreeXid(xid);
+		return -EINVAL;
+	}
+
+	max_num = (max_buf - sizeof(struct smb_hdr)) /
+		  sizeof(LOCKING_ANDX_RANGE);
 	buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
 	if (!buf) {
 		mutex_unlock(&cinode->lock_mutex);
@@ -1218,7 +1229,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 	int types[] = {LOCKING_ANDX_LARGE_FILES,
 		       LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
 	unsigned int i;
-	unsigned int max_num, num;
+	unsigned int max_num, num, max_buf;
 	LOCKING_ANDX_RANGE *buf, *cur;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
@@ -1228,8 +1239,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 
 	INIT_LIST_HEAD(&tmp_llist);
 
-	max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
-		  sizeof(LOCKING_ANDX_RANGE);
+	/*
+	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
+	 * and check it for zero before using.
+	 */
+	max_buf = tcon->ses->server->maxBuf;
+	if (!max_buf)
+		return -EINVAL;
+
+	max_num = (max_buf - sizeof(struct smb_hdr)) /
+		  sizeof(LOCKING_ANDX_RANGE);
 	buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
@@ -1247,46 +1266,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 				continue;
 			if (types[i] != li->type)
 				continue;
-			if (!cinode->can_cache_brlcks) {
-				cur->Pid = cpu_to_le16(li->pid);
-				cur->LengthLow = cpu_to_le32((u32)li->length);
-				cur->LengthHigh =
-					cpu_to_le32((u32)(li->length>>32));
-				cur->OffsetLow = cpu_to_le32((u32)li->offset);
-				cur->OffsetHigh =
-					cpu_to_le32((u32)(li->offset>>32));
-				/*
-				 * We need to save a lock here to let us add
-				 * it again to the file's list if the unlock
-				 * range request fails on the server.
-				 */
-				list_move(&li->llist, &tmp_llist);
-				if (++num == max_num) {
-					stored_rc = cifs_lockv(xid, tcon,
-							       cfile->netfid,
-							       li->type, num,
-							       0, buf);
-					if (stored_rc) {
-						/*
-						 * We failed on the unlock range
-						 * request - add all locks from
-						 * the tmp list to the head of
-						 * the file's list.
-						 */
-						cifs_move_llist(&tmp_llist,
-								&cfile->llist);
-						rc = stored_rc;
-					} else
-						/*
-						 * The unlock range request
-						 * succeed - free the tmp list.
-						 */
-						cifs_free_llist(&tmp_llist);
-					cur = buf;
-					num = 0;
-				} else
-					cur++;
-		} else {
+			if (cinode->can_cache_brlcks) {
 				/*
 				 * We can cache brlock requests - simply remove
 				 * a lock from the file's list.
@@ -1294,7 +1274,41 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 				list_del(&li->llist);
 				cifs_del_lock_waiters(li);
 				kfree(li);
+				continue;
 			}
+			cur->Pid = cpu_to_le16(li->pid);
+			cur->LengthLow = cpu_to_le32((u32)li->length);
+			cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+			cur->OffsetLow = cpu_to_le32((u32)li->offset);
+			cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+			/*
+			 * We need to save a lock here to let us add it again to
+			 * the file's list if the unlock range request fails on
+			 * the server.
+			 */
+			list_move(&li->llist, &tmp_llist);
+			if (++num == max_num) {
+				stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+						       li->type, num, 0, buf);
+				if (stored_rc) {
+					/*
+					 * We failed on the unlock range
+					 * request - add all locks from the tmp
+					 * list to the head of the file's list.
+					 */
+					cifs_move_llist(&tmp_llist,
+							&cfile->llist);
+					rc = stored_rc;
+				} else
+					/*
+					 * The unlock range request succeed -
+					 * free the tmp list.
+					 */
+					cifs_free_llist(&tmp_llist);
+				cur = buf;
+				num = 0;
+			} else
+				cur++;
 		}
 		if (num) {
 			stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e2552d2b2e42..557506ae1e2a 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -212,93 +212,6 @@ cifs_small_buf_release(void *buf_to_free)
 	return;
 }
 
-/*
- * Find a free multiplex id (SMB mid). Otherwise there could be
- * mid collisions which might cause problems, demultiplexing the
- * wrong response to this request. Multiplex ids could collide if
- * one of a series requests takes much longer than the others, or
- * if a very large number of long lived requests (byte range
- * locks or FindNotify requests) are pending. No more than
- * 64K-1 requests can be outstanding at one time. If no
- * mids are available, return zero. A future optimization
- * could make the combination of mids and uid the key we use
- * to demultiplex on (rather than mid alone).
- * In addition to the above check, the cifs demultiplex
- * code already used the command code as a secondary
- * check of the frame and if signing is negotiated the
- * response would be discarded if the mid were the same
- * but the signature was wrong. Since the mid is not put in the
- * pending queue until later (when it is about to be dispatched)
- * we do have to limit the number of outstanding requests
- * to somewhat less than 64K-1 although it is hard to imagine
- * so many threads being in the vfs at one time.
- */
-__u64 GetNextMid(struct TCP_Server_Info *server)
-{
-	__u64 mid = 0;
-	__u16 last_mid, cur_mid;
-	bool collision;
-
-	spin_lock(&GlobalMid_Lock);
-
-	/* mid is 16 bit only for CIFS/SMB */
-	cur_mid = (__u16)((server->CurrentMid) & 0xffff);
-	/* we do not want to loop forever */
-	last_mid = cur_mid;
-	cur_mid++;
-
-	/*
-	 * This nested loop looks more expensive than it is.
-	 * In practice the list of pending requests is short,
-	 * fewer than 50, and the mids are likely to be unique
-	 * on the first pass through the loop unless some request
-	 * takes longer than the 64 thousand requests before it
-	 * (and it would also have to have been a request that
-	 * did not time out).
-	 */
-	while (cur_mid != last_mid) {
-		struct mid_q_entry *mid_entry;
-		unsigned int num_mids;
-
-		collision = false;
-		if (cur_mid == 0)
-			cur_mid++;
-
-		num_mids = 0;
-		list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
-			++num_mids;
-			if (mid_entry->mid == cur_mid &&
-			    mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
-				/* This mid is in use, try a different one */
-				collision = true;
-				break;
-			}
-		}
-
-		/*
-		 * if we have more than 32k mids in the list, then something
-		 * is very wrong. Possibly a local user is trying to DoS the
-		 * box by issuing long-running calls and SIGKILL'ing them. If
-		 * we get to 2^16 mids then we're in big trouble as this
-		 * function could loop forever.
-		 *
-		 * Go ahead and assign out the mid in this situation, but force
-		 * an eventual reconnect to clean out the pending_mid_q.
-		 */
-		if (num_mids > 32768)
-			server->tcpStatus = CifsNeedReconnect;
-
-		if (!collision) {
-			mid = (__u64)cur_mid;
-			server->CurrentMid = mid;
-			break;
-		}
-		cur_mid++;
-	}
-	spin_unlock(&GlobalMid_Lock);
-	return mid;
-}
-
 /* NB: MID can not be set if treeCon not passed in, in that
    case it is responsbility of caller to set the mid */
 void
@@ -334,7 +247,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 
 	/* Uid is not converted */
 	buffer->Uid = treeCon->ses->Suid;
-	buffer->Mid = GetNextMid(treeCon->ses->server);
+	buffer->Mid = get_next_mid(treeCon->ses->server);
 	}
 	if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
 		buffer->Flags2 |= SMBFLG2_DFS;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d9d615fbed3f..6dec38f5522d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -125,6 +125,94 @@ cifs_get_credits_field(struct TCP_Server_Info *server)
 	return &server->credits;
 }
 
+/*
+ * Find a free multiplex id (SMB mid). Otherwise there could be
+ * mid collisions which might cause problems, demultiplexing the
+ * wrong response to this request. Multiplex ids could collide if
+ * one of a series requests takes much longer than the others, or
+ * if a very large number of long lived requests (byte range
+ * locks or FindNotify requests) are pending. No more than
+ * 64K-1 requests can be outstanding at one time. If no
+ * mids are available, return zero. A future optimization
+ * could make the combination of mids and uid the key we use
+ * to demultiplex on (rather than mid alone).
+ * In addition to the above check, the cifs demultiplex
+ * code already used the command code as a secondary
+ * check of the frame and if signing is negotiated the
+ * response would be discarded if the mid were the same
+ * but the signature was wrong. Since the mid is not put in the
+ * pending queue until later (when it is about to be dispatched)
+ * we do have to limit the number of outstanding requests
+ * to somewhat less than 64K-1 although it is hard to imagine
+ * so many threads being in the vfs at one time.
+ */
+static __u64
+cifs_get_next_mid(struct TCP_Server_Info *server)
+{
+	__u64 mid = 0;
+	__u16 last_mid, cur_mid;
+	bool collision;
+
+	spin_lock(&GlobalMid_Lock);
+
+	/* mid is 16 bit only for CIFS/SMB */
+	cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+	/* we do not want to loop forever */
+	last_mid = cur_mid;
+	cur_mid++;
+
+	/*
+	 * This nested loop looks more expensive than it is.
+	 * In practice the list of pending requests is short,
+	 * fewer than 50, and the mids are likely to be unique
+	 * on the first pass through the loop unless some request
+	 * takes longer than the 64 thousand requests before it
+	 * (and it would also have to have been a request that
+	 * did not time out).
+	 */
+	while (cur_mid != last_mid) {
+		struct mid_q_entry *mid_entry;
+		unsigned int num_mids;
+
+		collision = false;
+		if (cur_mid == 0)
+			cur_mid++;
+
+		num_mids = 0;
+		list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+			++num_mids;
+			if (mid_entry->mid == cur_mid &&
+			    mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
+				/* This mid is in use, try a different one */
+				collision = true;
+				break;
+			}
+		}
+
+		/*
+		 * if we have more than 32k mids in the list, then something
+		 * is very wrong. Possibly a local user is trying to DoS the
+		 * box by issuing long-running calls and SIGKILL'ing them. If
+		 * we get to 2^16 mids then we're in big trouble as this
+		 * function could loop forever.
+		 *
+		 * Go ahead and assign out the mid in this situation, but force
+		 * an eventual reconnect to clean out the pending_mid_q.
+		 */
+		if (num_mids > 32768)
+			server->tcpStatus = CifsNeedReconnect;
+
+		if (!collision) {
+			mid = (__u64)cur_mid;
+			server->CurrentMid = mid;
+			break;
+		}
+		cur_mid++;
+	}
+	spin_unlock(&GlobalMid_Lock);
+	return mid;
+}
+
 struct smb_version_operations smb1_operations = {
 	.send_cancel = send_nt_cancel,
 	.compare_fids = cifs_compare_fids,
@@ -133,6 +221,7 @@ struct smb_version_operations smb1_operations = {
 	.add_credits = cifs_add_credits,
 	.set_credits = cifs_set_credits,
 	.get_credits_field = cifs_get_credits_field,
+	.get_next_mid = cifs_get_next_mid,
 	.read_data_offset = cifs_read_data_offset,
 	.read_data_length = cifs_read_data_length,
 	.map_error = map_smb_to_linux_error,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1b36ffe6a47b..3097ee58fd7d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -779,7 +779,7 @@ send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
 
 	pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
 	pSMB->Timeout = 0;
-	pSMB->hdr.Mid = GetNextMid(ses->server);
+	pSMB->hdr.Mid = get_next_mid(ses->server);
 
 	return SendReceive(xid, ses, in_buf, out_buf,
 			&bytes_returned, 0);
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h new file mode 100644 index 000000000000..0e4e2eec5c1d --- /dev/null +++ b/include/linux/frontswap.h | |||
| @@ -0,0 +1,127 @@ | |||
| 1 | #ifndef _LINUX_FRONTSWAP_H | ||
| 2 | #define _LINUX_FRONTSWAP_H | ||
| 3 | |||
| 4 | #include <linux/swap.h> | ||
| 5 | #include <linux/mm.h> | ||
| 6 | #include <linux/bitops.h> | ||
| 7 | |||
| 8 | struct frontswap_ops { | ||
| 9 | void (*init)(unsigned); | ||
| 10 | int (*store)(unsigned, pgoff_t, struct page *); | ||
| 11 | int (*load)(unsigned, pgoff_t, struct page *); | ||
| 12 | void (*invalidate_page)(unsigned, pgoff_t); | ||
| 13 | void (*invalidate_area)(unsigned); | ||
| 14 | }; | ||
| 15 | |||
| 16 | extern bool frontswap_enabled; | ||
| 17 | extern struct frontswap_ops | ||
| 18 | frontswap_register_ops(struct frontswap_ops *ops); | ||
| 19 | extern void frontswap_shrink(unsigned long); | ||
| 20 | extern unsigned long frontswap_curr_pages(void); | ||
| 21 | extern void frontswap_writethrough(bool); | ||
| 22 | |||
| 23 | extern void __frontswap_init(unsigned type); | ||
| 24 | extern int __frontswap_store(struct page *page); | ||
| 25 | extern int __frontswap_load(struct page *page); | ||
| 26 | extern void __frontswap_invalidate_page(unsigned, pgoff_t); | ||
| 27 | extern void __frontswap_invalidate_area(unsigned); | ||
| 28 | |||
| 29 | #ifdef CONFIG_FRONTSWAP | ||
| 30 | |||
| 31 | static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) | ||
| 32 | { | ||
| 33 | bool ret = false; | ||
| 34 | |||
| 35 | if (frontswap_enabled && sis->frontswap_map) | ||
| 36 | ret = test_bit(offset, sis->frontswap_map); | ||
| 37 | return ret; | ||
| 38 | } | ||
| 39 | |||
| 40 | static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) | ||
| 41 | { | ||
| 42 | if (frontswap_enabled && sis->frontswap_map) | ||
| 43 | set_bit(offset, sis->frontswap_map); | ||
| 44 | } | ||
| 45 | |||
| 46 | static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) | ||
| 47 | { | ||
| 48 | if (frontswap_enabled && sis->frontswap_map) | ||
| 49 | clear_bit(offset, sis->frontswap_map); | ||
| 50 | } | ||
| 51 | |||
| 52 | static inline void frontswap_map_set(struct swap_info_struct *p, | ||
| 53 | unsigned long *map) | ||
| 54 | { | ||
| 55 | p->frontswap_map = map; | ||
| 56 | } | ||
| 57 | |||
| 58 | static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) | ||
| 59 | { | ||
| 60 | return p->frontswap_map; | ||
| 61 | } | ||
| 62 | #else | ||
| 63 | /* all inline routines become no-ops and all externs are ignored */ | ||
| 64 | |||
| 65 | #define frontswap_enabled (0) | ||
| 66 | |||
| 67 | static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) | ||
| 68 | { | ||
| 69 | return false; | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) | ||
| 73 | { | ||
| 74 | } | ||
| 75 | |||
| 76 | static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) | ||
| 77 | { | ||
| 78 | } | ||
| 79 | |||
| 80 | static inline void frontswap_map_set(struct swap_info_struct *p, | ||
| 81 | unsigned long *map) | ||
| 82 | { | ||
| 83 | } | ||
| 84 | |||
| 85 | static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) | ||
| 86 | { | ||
| 87 | return NULL; | ||
| 88 | } | ||
| 89 | #endif | ||
| 90 | |||
| 91 | static inline int frontswap_store(struct page *page) | ||
| 92 | { | ||
| 93 | int ret = -1; | ||
| 94 | |||
| 95 | if (frontswap_enabled) | ||
| 96 | ret = __frontswap_store(page); | ||
| 97 | return ret; | ||
| 98 | } | ||
| 99 | |||
| 100 | static inline int frontswap_load(struct page *page) | ||
| 101 | { | ||
| 102 | int ret = -1; | ||
| 103 | |||
| 104 | if (frontswap_enabled) | ||
| 105 | ret = __frontswap_load(page); | ||
| 106 | return ret; | ||
| 107 | } | ||
| 108 | |||
| 109 | static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
| 110 | { | ||
| 111 | if (frontswap_enabled) | ||
| 112 | __frontswap_invalidate_page(type, offset); | ||
| 113 | } | ||
| 114 | |||
| 115 | static inline void frontswap_invalidate_area(unsigned type) | ||
| 116 | { | ||
| 117 | if (frontswap_enabled) | ||
| 118 | __frontswap_invalidate_area(type); | ||
| 119 | } | ||
| 120 | |||
| 121 | static inline void frontswap_init(unsigned type) | ||
| 122 | { | ||
| 123 | if (frontswap_enabled) | ||
| 124 | __frontswap_init(type); | ||
| 125 | } | ||
| 126 | |||
| 127 | #endif /* _LINUX_FRONTSWAP_H */ | ||
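The header above defines the complete backend contract: five function pointers, installed once via frontswap_register_ops. As a rough sketch only (the sketch_* names are hypothetical and not part of this patch), a minimal backend module conforming to that contract could look like the following; note that a backend is always allowed to decline a store, in which case the page simply goes to the real swap device:

/* sketch_frontswap.c: hypothetical minimal backend, illustration only */
#include <linux/module.h>
#include <linux/frontswap.h>

static void sketch_init(unsigned type)
{
	pr_info("sketch: swapon for swap type %u\n", type);
}

/* Declining every store is legal; the page then falls through to swap. */
static int sketch_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;
}

static int sketch_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* nothing was ever stored, so nothing to load */
}

static void sketch_invalidate_page(unsigned type, pgoff_t offset)
{
}

static void sketch_invalidate_area(unsigned type)
{
}

static struct frontswap_ops sketch_ops = {
	.init			= sketch_init,
	.store			= sketch_store,
	.load			= sketch_load,
	.invalidate_page	= sketch_invalidate_page,
	.invalidate_area	= sketch_invalidate_area,
};

static int __init sketch_module_init(void)
{
	/* the returned struct holds any previously registered backend */
	struct frontswap_ops old = frontswap_register_ops(&sketch_ops);

	if (old.init != NULL)
		pr_warn("sketch: replaced an existing frontswap backend\n");
	return 0;
}
module_init(sketch_module_init);
MODULE_LICENSE("GPL");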
diff --git a/include/linux/swap.h b/include/linux/swap.h index b6661933e252..c84ec68eaec9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
| @@ -197,6 +197,10 @@ struct swap_info_struct { | |||
| 197 | struct block_device *bdev; /* swap device or bdev of swap file */ | 197 | struct block_device *bdev; /* swap device or bdev of swap file */ |
| 198 | struct file *swap_file; /* seldom referenced */ | 198 | struct file *swap_file; /* seldom referenced */ |
| 199 | unsigned int old_block_size; /* seldom referenced */ | 199 | unsigned int old_block_size; /* seldom referenced */ |
| 200 | #ifdef CONFIG_FRONTSWAP | ||
| 201 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ | ||
| 202 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ | ||
| 203 | #endif | ||
| 200 | }; | 204 | }; |
| 201 | 205 | ||
| 202 | struct swap_list_t { | 206 | struct swap_list_t { |
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h new file mode 100644 index 000000000000..e282624e8c10 --- /dev/null +++ b/include/linux/swapfile.h | |||
| @@ -0,0 +1,13 @@ | |||
| 1 | #ifndef _LINUX_SWAPFILE_H | ||
| 2 | #define _LINUX_SWAPFILE_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * These were static in swapfile.c but frontswap.c needs them, and we don't | ||
| 6 | * want to expose them to the dozens of source files that include swap.h. | ||
| 7 | */ | ||
| 8 | extern spinlock_t swap_lock; | ||
| 9 | extern struct swap_list_t swap_list; | ||
| 10 | extern struct swap_info_struct *swap_info[]; | ||
| 11 | extern int try_to_unuse(unsigned int, bool, unsigned long); | ||
| 12 | |||
| 13 | #endif /* _LINUX_SWAPFILE_H */ | ||
diff --git a/mm/Kconfig b/mm/Kconfig index b2176374b98e..82fed4eb2b6f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
| @@ -389,3 +389,20 @@ config CLEANCACHE | |||
| 389 | in a negligible performance hit. | 389 | in a negligible performance hit. |
| 390 | 390 | ||
| 391 | If unsure, say Y to enable cleancache | 391 | If unsure, say Y to enable cleancache |
| 392 | |||
| 393 | config FRONTSWAP | ||
| 394 | bool "Enable frontswap to cache swap pages if tmem is present" | ||
| 395 | depends on SWAP | ||
| 396 | default n | ||
| 397 | help | ||
| 398 | Frontswap is so named because it can be thought of as the opposite | ||
| 399 | of a "backing" store for a swap device. The data is stored into | ||
| 400 | "transcendent memory", memory that is not directly accessible or | ||
| 401 | addressable by the kernel and is of unknown and possibly | ||
| 402 | time-varying size. When space in transcendent memory is available, | ||
| 403 | a significant swap I/O reduction may be achieved. When none is | ||
| 404 | available, all frontswap calls are reduced to a single pointer- | ||
| 405 | compare-against-NULL, resulting in a negligible performance hit, | ||
| 406 | and swap data is stored as normal on the matching swap device. | ||
| 407 | |||
| 408 | If unsure, say Y to enable frontswap. | ||
diff --git a/mm/Makefile b/mm/Makefile index a156285ce88d..2e2fbbefb99f 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
| @@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | |||
| 29 | 29 | ||
| 30 | obj-$(CONFIG_BOUNCE) += bounce.o | 30 | obj-$(CONFIG_BOUNCE) += bounce.o |
| 31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
| 32 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | ||
| 32 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 33 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
| 33 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 34 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
| 34 | obj-$(CONFIG_NUMA) += mempolicy.o | 35 | obj-$(CONFIG_NUMA) += mempolicy.o |
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..e25025574a02 --- /dev/null +++ b/mm/frontswap.c | |||
| @@ -0,0 +1,314 @@ | |||
| 1 | /* | ||
| 2 | * Frontswap frontend | ||
| 3 | * | ||
| 4 | * This code provides the generic "frontend" layer to call a matching | ||
| 5 | * "backend" driver implementation of frontswap. See | ||
| 6 | * Documentation/vm/frontswap.txt for more information. | ||
| 7 | * | ||
| 8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
| 9 | * Author: Dan Magenheimer | ||
| 10 | * | ||
| 11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/mm.h> | ||
| 15 | #include <linux/mman.h> | ||
| 16 | #include <linux/swap.h> | ||
| 17 | #include <linux/swapops.h> | ||
| 18 | #include <linux/proc_fs.h> | ||
| 19 | #include <linux/security.h> | ||
| 20 | #include <linux/capability.h> | ||
| 21 | #include <linux/module.h> | ||
| 22 | #include <linux/uaccess.h> | ||
| 23 | #include <linux/debugfs.h> | ||
| 24 | #include <linux/frontswap.h> | ||
| 25 | #include <linux/swapfile.h> | ||
| 26 | |||
| 27 | /* | ||
| 28 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
| 29 | * to the frontswap "backend" implementation functions. | ||
| 30 | */ | ||
| 31 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
| 32 | |||
| 33 | /* | ||
| 34 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
| 35 | * has not been registered, so is preferred to the slower alternative: a | ||
| 36 | * function call that checks a non-global. | ||
| 37 | */ | ||
| 38 | bool frontswap_enabled __read_mostly; | ||
| 39 | EXPORT_SYMBOL(frontswap_enabled); | ||
| 40 | |||
| 41 | /* | ||
| 42 | * If enabled, frontswap_store will return failure even on success. As | ||
| 43 | * a result, the swap subsystem will always write the page to swap, in | ||
| 44 | * effect converting frontswap into a writethrough cache. In this mode, | ||
| 45 | * there is no direct reduction in swap writes, but a frontswap backend | ||
| 46 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
| 47 | * providing increased control over maximum memory usage due to frontswap. | ||
| 48 | */ | ||
| 49 | static bool frontswap_writethrough_enabled __read_mostly; | ||
| 50 | |||
| 51 | #ifdef CONFIG_DEBUG_FS | ||
| 52 | /* | ||
| 53 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | ||
| 54 | * properly configured). These are for information only so are not protected | ||
| 55 | * against increment races. | ||
| 56 | */ | ||
| 57 | static u64 frontswap_loads; | ||
| 58 | static u64 frontswap_succ_stores; | ||
| 59 | static u64 frontswap_failed_stores; | ||
| 60 | static u64 frontswap_invalidates; | ||
| 61 | |||
| 62 | static inline void inc_frontswap_loads(void) { | ||
| 63 | frontswap_loads++; | ||
| 64 | } | ||
| 65 | static inline void inc_frontswap_succ_stores(void) { | ||
| 66 | frontswap_succ_stores++; | ||
| 67 | } | ||
| 68 | static inline void inc_frontswap_failed_stores(void) { | ||
| 69 | frontswap_failed_stores++; | ||
| 70 | } | ||
| 71 | static inline void inc_frontswap_invalidates(void) { | ||
| 72 | frontswap_invalidates++; | ||
| 73 | } | ||
| 74 | #else | ||
| 75 | static inline void inc_frontswap_loads(void) { } | ||
| 76 | static inline void inc_frontswap_succ_stores(void) { } | ||
| 77 | static inline void inc_frontswap_failed_stores(void) { } | ||
| 78 | static inline void inc_frontswap_invalidates(void) { } | ||
| 79 | #endif | ||
| 80 | /* | ||
| 81 | * Register operations for frontswap, returning the previous ops to allow | ||
| 82 | * detection of multiple backends and possible nesting. | ||
| 83 | */ | ||
| 84 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
| 85 | { | ||
| 86 | struct frontswap_ops old = frontswap_ops; | ||
| 87 | |||
| 88 | frontswap_ops = *ops; | ||
| 89 | frontswap_enabled = true; | ||
| 90 | return old; | ||
| 91 | } | ||
| 92 | EXPORT_SYMBOL(frontswap_register_ops); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * Enable/disable frontswap writethrough (see above). | ||
| 96 | */ | ||
| 97 | void frontswap_writethrough(bool enable) | ||
| 98 | { | ||
| 99 | frontswap_writethrough_enabled = enable; | ||
| 100 | } | ||
| 101 | EXPORT_SYMBOL(frontswap_writethrough); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Called when a swap device is swapon'd. | ||
| 105 | */ | ||
| 106 | void __frontswap_init(unsigned type) | ||
| 107 | { | ||
| 108 | struct swap_info_struct *sis = swap_info[type]; | ||
| 109 | |||
| 110 | BUG_ON(sis == NULL); | ||
| 111 | if (sis->frontswap_map == NULL) | ||
| 112 | return; | ||
| 113 | if (frontswap_enabled) | ||
| 114 | (*frontswap_ops.init)(type); | ||
| 115 | } | ||
| 116 | EXPORT_SYMBOL(__frontswap_init); | ||
| 117 | |||
| 118 | /* | ||
| 119 | * "Store" data from a page to frontswap and associate it with the page's | ||
| 120 | * swaptype and offset. Page must be locked and in the swap cache. | ||
| 121 | * If frontswap already contains a page with matching swaptype and | ||
| 122 | * offset, the frontswap implementation may either overwrite the data and | ||
| 123 | * return success or invalidate the page from frontswap and return failure. | ||
| 124 | */ | ||
| 125 | int __frontswap_store(struct page *page) | ||
| 126 | { | ||
| 127 | int ret = -1, dup = 0; | ||
| 128 | swp_entry_t entry = { .val = page_private(page), }; | ||
| 129 | int type = swp_type(entry); | ||
| 130 | struct swap_info_struct *sis = swap_info[type]; | ||
| 131 | pgoff_t offset = swp_offset(entry); | ||
| 132 | |||
| 133 | BUG_ON(!PageLocked(page)); | ||
| 134 | BUG_ON(sis == NULL); | ||
| 135 | if (frontswap_test(sis, offset)) | ||
| 136 | dup = 1; | ||
| 137 | ret = (*frontswap_ops.store)(type, offset, page); | ||
| 138 | if (ret == 0) { | ||
| 139 | frontswap_set(sis, offset); | ||
| 140 | inc_frontswap_succ_stores(); | ||
| 141 | if (!dup) | ||
| 142 | atomic_inc(&sis->frontswap_pages); | ||
| 143 | } else if (dup) { | ||
| 144 | /* | ||
| 145 | * A failed dup always results in an automatic invalidate of | ||
| 146 | * the (older) page from frontswap. | ||
| 147 | */ | ||
| 148 | frontswap_clear(sis, offset); | ||
| 149 | atomic_dec(&sis->frontswap_pages); | ||
| 150 | inc_frontswap_failed_stores(); | ||
| 151 | } else | ||
| 152 | inc_frontswap_failed_stores(); | ||
| 153 | if (frontswap_writethrough_enabled) | ||
| 154 | /* report failure so swap also writes to swap device */ | ||
| 155 | ret = -1; | ||
| 156 | return ret; | ||
| 157 | } | ||
| 158 | EXPORT_SYMBOL(__frontswap_store); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
| 162 | * specified when the data was put to frontswap and use it to fill the | ||
| 163 | * specified page with data. Page must be locked and in the swap cache. | ||
| 164 | */ | ||
| 165 | int __frontswap_load(struct page *page) | ||
| 166 | { | ||
| 167 | int ret = -1; | ||
| 168 | swp_entry_t entry = { .val = page_private(page), }; | ||
| 169 | int type = swp_type(entry); | ||
| 170 | struct swap_info_struct *sis = swap_info[type]; | ||
| 171 | pgoff_t offset = swp_offset(entry); | ||
| 172 | |||
| 173 | BUG_ON(!PageLocked(page)); | ||
| 174 | BUG_ON(sis == NULL); | ||
| 175 | if (frontswap_test(sis, offset)) | ||
| 176 | ret = (*frontswap_ops.load)(type, offset, page); | ||
| 177 | if (ret == 0) | ||
| 178 | inc_frontswap_loads(); | ||
| 179 | return ret; | ||
| 180 | } | ||
| 181 | EXPORT_SYMBOL(__frontswap_load); | ||
| 182 | |||
| 183 | /* | ||
| 184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
| 185 | * and offset so that a subsequent "load" will fail. | ||
| 186 | */ | ||
| 187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
| 188 | { | ||
| 189 | struct swap_info_struct *sis = swap_info[type]; | ||
| 190 | |||
| 191 | BUG_ON(sis == NULL); | ||
| 192 | if (frontswap_test(sis, offset)) { | ||
| 193 | (*frontswap_ops.invalidate_page)(type, offset); | ||
| 194 | atomic_dec(&sis->frontswap_pages); | ||
| 195 | frontswap_clear(sis, offset); | ||
| 196 | inc_frontswap_invalidates(); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Invalidate all data from frontswap associated with all offsets for the | ||
| 203 | * specified swaptype. | ||
| 204 | */ | ||
| 205 | void __frontswap_invalidate_area(unsigned type) | ||
| 206 | { | ||
| 207 | struct swap_info_struct *sis = swap_info[type]; | ||
| 208 | |||
| 209 | BUG_ON(sis == NULL); | ||
| 210 | if (sis->frontswap_map == NULL) | ||
| 211 | return; | ||
| 212 | (*frontswap_ops.invalidate_area)(type); | ||
| 213 | atomic_set(&sis->frontswap_pages, 0); | ||
| 214 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
| 215 | } | ||
| 216 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
| 217 | |||
| 218 | /* | ||
| 219 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
| 220 | * under certain circumstances; "shrink" frontswap is essentially a | ||
| 221 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
| 222 | * unuse enough frontswap pages to attempt to -- subject to memory | ||
| 223 | * constraints -- reduce the number of pages in frontswap to the | ||
| 224 | * number given in the parameter target_pages. | ||
| 225 | */ | ||
| 226 | void frontswap_shrink(unsigned long target_pages) | ||
| 227 | { | ||
| 228 | struct swap_info_struct *si = NULL; | ||
| 229 | int si_frontswap_pages; | ||
| 230 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
| 231 | unsigned long pages = 0, pages_to_unuse = 0; | ||
| 232 | int type; | ||
| 233 | bool locked = false; | ||
| 234 | |||
| 235 | /* | ||
| 236 | * we don't want to hold swap_lock while doing a very | ||
| 237 | * lengthy try_to_unuse, but swap_list may change | ||
| 238 | * so restart scan from swap_list.head each time | ||
| 239 | */ | ||
| 240 | spin_lock(&swap_lock); | ||
| 241 | locked = true; | ||
| 242 | total_pages = 0; | ||
| 243 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 244 | si = swap_info[type]; | ||
| 245 | total_pages += atomic_read(&si->frontswap_pages); | ||
| 246 | } | ||
| 247 | if (total_pages <= target_pages) | ||
| 248 | goto out; | ||
| 249 | total_pages_to_unuse = total_pages - target_pages; | ||
| 250 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 251 | si = swap_info[type]; | ||
| 252 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
| 253 | if (total_pages_to_unuse < si_frontswap_pages) | ||
| 254 | pages = pages_to_unuse = total_pages_to_unuse; | ||
| 255 | else { | ||
| 256 | pages = si_frontswap_pages; | ||
| 257 | pages_to_unuse = 0; /* unuse all */ | ||
| 258 | } | ||
| 259 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
| 260 | if (security_vm_enough_memory_mm(current->mm, pages)) | ||
| 261 | continue; | ||
| 262 | vm_unacct_memory(pages); | ||
| 263 | break; | ||
| 264 | } | ||
| 265 | if (type < 0) | ||
| 266 | goto out; | ||
| 267 | locked = false; | ||
| 268 | spin_unlock(&swap_lock); | ||
| 269 | try_to_unuse(type, true, pages_to_unuse); | ||
| 270 | out: | ||
| 271 | if (locked) | ||
| 272 | spin_unlock(&swap_lock); | ||
| 273 | return; | ||
| 274 | } | ||
| 275 | EXPORT_SYMBOL(frontswap_shrink); | ||
| 276 | |||
| 277 | /* | ||
| 278 | * Count and return the number of frontswap pages across all | ||
| 279 | * swap devices. This is exported so that backend drivers can | ||
| 280 | * determine current usage without reading debugfs. | ||
| 281 | */ | ||
| 282 | unsigned long frontswap_curr_pages(void) | ||
| 283 | { | ||
| 284 | int type; | ||
| 285 | unsigned long totalpages = 0; | ||
| 286 | struct swap_info_struct *si = NULL; | ||
| 287 | |||
| 288 | spin_lock(&swap_lock); | ||
| 289 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 290 | si = swap_info[type]; | ||
| 291 | totalpages += atomic_read(&si->frontswap_pages); | ||
| 292 | } | ||
| 293 | spin_unlock(&swap_lock); | ||
| 294 | return totalpages; | ||
| 295 | } | ||
| 296 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
| 297 | |||
| 298 | static int __init init_frontswap(void) | ||
| 299 | { | ||
| 300 | #ifdef CONFIG_DEBUG_FS | ||
| 301 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
| 302 | if (root == NULL) | ||
| 303 | return -ENXIO; | ||
| 304 | debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); | ||
| 305 | debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); | ||
| 306 | debugfs_create_u64("failed_stores", S_IRUGO, root, | ||
| 307 | &frontswap_failed_stores); | ||
| 308 | debugfs_create_u64("invalidates", S_IRUGO, | ||
| 309 | root, &frontswap_invalidates); | ||
| 310 | #endif | ||
| 311 | return 0; | ||
| 312 | } | ||
| 313 | |||
| 314 | module_init(init_frontswap); | ||
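frontswap_shrink and frontswap_curr_pages above are exported precisely so a backend can shed memory when its own capacity comes under pressure. A hedged sketch of such a caller (sketch_relieve_pressure and its halving policy are hypothetical, not from this patch):

#include <linux/frontswap.h>

/* Hypothetical backend pressure handler: ask the frontend to unuse
 * pages until frontswap holds at most half of what it holds now. */
static void sketch_relieve_pressure(void)
{
	unsigned long cur = frontswap_curr_pages();

	if (cur > 1)
		frontswap_shrink(cur / 2);
}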
diff --git a/mm/nommu.c b/mm/nommu.c index c4acfbc09972..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
| 1486 | 1486 | ||
| 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
| 1488 | 1488 | ||
| 1489 | ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1489 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
| 1490 | 1490 | ||
| 1491 | if (file) | 1491 | if (file) |
| 1492 | fput(file); | 1492 | fput(file); |
diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..34f02923744c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
| 19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
| 20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
| 21 | #include <linux/frontswap.h> | ||
| 21 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
| 22 | 23 | ||
| 23 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 24 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
| @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
| 98 | unlock_page(page); | 99 | unlock_page(page); |
| 99 | goto out; | 100 | goto out; |
| 100 | } | 101 | } |
| 102 | if (frontswap_store(page) == 0) { | ||
| 103 | set_page_writeback(page); | ||
| 104 | unlock_page(page); | ||
| 105 | end_page_writeback(page); | ||
| 106 | goto out; | ||
| 107 | } | ||
| 101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 108 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
| 102 | if (bio == NULL) { | 109 | if (bio == NULL) { |
| 103 | set_page_dirty(page); | 110 | set_page_dirty(page); |
| @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) | |||
| 122 | 129 | ||
| 123 | VM_BUG_ON(!PageLocked(page)); | 130 | VM_BUG_ON(!PageLocked(page)); |
| 124 | VM_BUG_ON(PageUptodate(page)); | 131 | VM_BUG_ON(PageUptodate(page)); |
| 132 | if (frontswap_load(page) == 0) { | ||
| 133 | SetPageUptodate(page); | ||
| 134 | unlock_page(page); | ||
| 135 | goto out; | ||
| 136 | } | ||
| 125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 137 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
| 126 | if (bio == NULL) { | 138 | if (bio == NULL) { |
| 127 | unlock_page(page); | 139 | unlock_page(page); |
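Both hooks above identify a page by the (type, offset) pair packed into its swp_entry_t. A small sketch of that decomposition, mirroring what __frontswap_store and __frontswap_load do internally (sketch_swap_key is a hypothetical helper, not part of the patch):

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/* For a locked swap-cache page, page_private() holds the swp_entry_t;
 * split it into the swap type (index into swap_info[]) and the page
 * offset within that swap area. */
static void sketch_swap_key(struct page *page, unsigned *type, pgoff_t *offset)
{
	swp_entry_t entry = { .val = page_private(page) };

	*type = swp_type(entry);
	*offset = swp_offset(entry);
}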
diff --git a/mm/swapfile.c b/mm/swapfile.c index 457b10baef59..de5bc51c4a66 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -31,6 +31,8 @@ | |||
| 31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
| 32 | #include <linux/poll.h> | 32 | #include <linux/poll.h> |
| 33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
| 34 | #include <linux/frontswap.h> | ||
| 35 | #include <linux/swapfile.h> | ||
| 34 | 36 | ||
| 35 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
| 36 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
| @@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | |||
| 42 | static void free_swap_count_continuations(struct swap_info_struct *); | 44 | static void free_swap_count_continuations(struct swap_info_struct *); |
| 43 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | 45 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); |
| 44 | 46 | ||
| 45 | static DEFINE_SPINLOCK(swap_lock); | 47 | DEFINE_SPINLOCK(swap_lock); |
| 46 | static unsigned int nr_swapfiles; | 48 | static unsigned int nr_swapfiles; |
| 47 | long nr_swap_pages; | 49 | long nr_swap_pages; |
| 48 | long total_swap_pages; | 50 | long total_swap_pages; |
| @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
| 53 | static const char Bad_offset[] = "Bad swap offset entry "; | 55 | static const char Bad_offset[] = "Bad swap offset entry "; |
| 54 | static const char Unused_offset[] = "Unused swap offset entry "; | 56 | static const char Unused_offset[] = "Unused swap offset entry "; |
| 55 | 57 | ||
| 56 | static struct swap_list_t swap_list = {-1, -1}; | 58 | struct swap_list_t swap_list = {-1, -1}; |
| 57 | 59 | ||
| 58 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 60 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
| 59 | 61 | ||
| 60 | static DEFINE_MUTEX(swapon_mutex); | 62 | static DEFINE_MUTEX(swapon_mutex); |
| 61 | 63 | ||
| @@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
| 556 | swap_list.next = p->type; | 558 | swap_list.next = p->type; |
| 557 | nr_swap_pages++; | 559 | nr_swap_pages++; |
| 558 | p->inuse_pages--; | 560 | p->inuse_pages--; |
| 561 | frontswap_invalidate_page(p->type, offset); | ||
| 559 | if ((p->flags & SWP_BLKDEV) && | 562 | if ((p->flags & SWP_BLKDEV) && |
| 560 | disk->fops->swap_slot_free_notify) | 563 | disk->fops->swap_slot_free_notify) |
| 561 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 564 | disk->fops->swap_slot_free_notify(p->bdev, offset); |
| @@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm, | |||
| 985 | } | 988 | } |
| 986 | 989 | ||
| 987 | /* | 990 | /* |
| 988 | * Scan swap_map from current position to next entry still in use. | 991 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
| 992 | * from current position to next entry still in use. | ||
| 989 | * Recycle to start on reaching the end, returning 0 when empty. | 993 | * Recycle to start on reaching the end, returning 0 when empty. |
| 990 | */ | 994 | */ |
| 991 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 995 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
| 992 | unsigned int prev) | 996 | unsigned int prev, bool frontswap) |
| 993 | { | 997 | { |
| 994 | unsigned int max = si->max; | 998 | unsigned int max = si->max; |
| 995 | unsigned int i = prev; | 999 | unsigned int i = prev; |
| @@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1015 | prev = 0; | 1019 | prev = 0; |
| 1016 | i = 1; | 1020 | i = 1; |
| 1017 | } | 1021 | } |
| 1022 | if (frontswap) { | ||
| 1023 | if (frontswap_test(si, i)) | ||
| 1024 | break; | ||
| 1025 | else | ||
| 1026 | continue; | ||
| 1027 | } | ||
| 1018 | count = si->swap_map[i]; | 1028 | count = si->swap_map[i]; |
| 1019 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1029 | if (count && swap_count(count) != SWAP_MAP_BAD) |
| 1020 | break; | 1030 | break; |
| @@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1026 | * We completely avoid races by reading each swap page in advance, | 1036 | * We completely avoid races by reading each swap page in advance, |
| 1027 | * and then search for the process using it. All the necessary | 1037 | * and then search for the process using it. All the necessary |
| 1028 | * page table adjustments can then be made atomically. | 1038 | * page table adjustments can then be made atomically. |
| 1039 | * | ||
| 1040 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
| 1041 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | ||
| 1029 | */ | 1042 | */ |
| 1030 | static int try_to_unuse(unsigned int type) | 1043 | int try_to_unuse(unsigned int type, bool frontswap, |
| 1044 | unsigned long pages_to_unuse) | ||
| 1031 | { | 1045 | { |
| 1032 | struct swap_info_struct *si = swap_info[type]; | 1046 | struct swap_info_struct *si = swap_info[type]; |
| 1033 | struct mm_struct *start_mm; | 1047 | struct mm_struct *start_mm; |
| @@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1060 | * one pass through swap_map is enough, but not necessarily: | 1074 | * one pass through swap_map is enough, but not necessarily: |
| 1061 | * there are races when an instance of an entry might be missed. | 1075 | * there are races when an instance of an entry might be missed. |
| 1062 | */ | 1076 | */ |
| 1063 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1077 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
| 1064 | if (signal_pending(current)) { | 1078 | if (signal_pending(current)) { |
| 1065 | retval = -EINTR; | 1079 | retval = -EINTR; |
| 1066 | break; | 1080 | break; |
| @@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type) | |||
| 1227 | * interactive performance. | 1241 | * interactive performance. |
| 1228 | */ | 1242 | */ |
| 1229 | cond_resched(); | 1243 | cond_resched(); |
| 1244 | if (frontswap && pages_to_unuse > 0) { | ||
| 1245 | if (!--pages_to_unuse) | ||
| 1246 | break; | ||
| 1247 | } | ||
| 1230 | } | 1248 | } |
| 1231 | 1249 | ||
| 1232 | mmput(start_mm); | 1250 | mmput(start_mm); |
| @@ -1486,7 +1504,8 @@ bad_bmap: | |||
| 1486 | } | 1504 | } |
| 1487 | 1505 | ||
| 1488 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1506 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
| 1489 | unsigned char *swap_map) | 1507 | unsigned char *swap_map, |
| 1508 | unsigned long *frontswap_map) | ||
| 1490 | { | 1509 | { |
| 1491 | int i, prev; | 1510 | int i, prev; |
| 1492 | 1511 | ||
| @@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
| 1496 | else | 1515 | else |
| 1497 | p->prio = --least_priority; | 1516 | p->prio = --least_priority; |
| 1498 | p->swap_map = swap_map; | 1517 | p->swap_map = swap_map; |
| 1518 | frontswap_map_set(p, frontswap_map); | ||
| 1499 | p->flags |= SWP_WRITEOK; | 1519 | p->flags |= SWP_WRITEOK; |
| 1500 | nr_swap_pages += p->pages; | 1520 | nr_swap_pages += p->pages; |
| 1501 | total_swap_pages += p->pages; | 1521 | total_swap_pages += p->pages; |
| @@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
| 1512 | swap_list.head = swap_list.next = p->type; | 1532 | swap_list.head = swap_list.next = p->type; |
| 1513 | else | 1533 | else |
| 1514 | swap_info[prev]->next = p->type; | 1534 | swap_info[prev]->next = p->type; |
| 1535 | frontswap_init(p->type); | ||
| 1515 | spin_unlock(&swap_lock); | 1536 | spin_unlock(&swap_lock); |
| 1516 | } | 1537 | } |
| 1517 | 1538 | ||
| @@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1585 | spin_unlock(&swap_lock); | 1606 | spin_unlock(&swap_lock); |
| 1586 | 1607 | ||
| 1587 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1608 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
| 1588 | err = try_to_unuse(type); | 1609 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
| 1589 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1610 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
| 1590 | 1611 | ||
| 1591 | if (err) { | 1612 | if (err) { |
| @@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1596 | * sys_swapoff for this swap_info_struct at this point. | 1617 | * sys_swapoff for this swap_info_struct at this point. |
| 1597 | */ | 1618 | */ |
| 1598 | /* re-insert swap space back into swap_list */ | 1619 | /* re-insert swap space back into swap_list */ |
| 1599 | enable_swap_info(p, p->prio, p->swap_map); | 1620 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
| 1600 | goto out_dput; | 1621 | goto out_dput; |
| 1601 | } | 1622 | } |
| 1602 | 1623 | ||
| @@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1622 | swap_map = p->swap_map; | 1643 | swap_map = p->swap_map; |
| 1623 | p->swap_map = NULL; | 1644 | p->swap_map = NULL; |
| 1624 | p->flags = 0; | 1645 | p->flags = 0; |
| 1646 | frontswap_invalidate_area(type); | ||
| 1625 | spin_unlock(&swap_lock); | 1647 | spin_unlock(&swap_lock); |
| 1626 | mutex_unlock(&swapon_mutex); | 1648 | mutex_unlock(&swapon_mutex); |
| 1627 | vfree(swap_map); | 1649 | vfree(swap_map); |
| 1650 | vfree(frontswap_map_get(p)); | ||
| 1628 | /* Destroy swap account information */ | 1651 | /* Destroy swap account information */
| 1629 | swap_cgroup_swapoff(type); | 1652 | swap_cgroup_swapoff(type); |
| 1630 | 1653 | ||
| @@ -1988,6 +2011,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1988 | sector_t span; | 2011 | sector_t span; |
| 1989 | unsigned long maxpages; | 2012 | unsigned long maxpages; |
| 1990 | unsigned char *swap_map = NULL; | 2013 | unsigned char *swap_map = NULL; |
| 2014 | unsigned long *frontswap_map = NULL; | ||
| 1991 | struct page *page = NULL; | 2015 | struct page *page = NULL; |
| 1992 | struct inode *inode = NULL; | 2016 | struct inode *inode = NULL; |
| 1993 | 2017 | ||
| @@ -2071,6 +2095,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 2071 | error = nr_extents; | 2095 | error = nr_extents; |
| 2072 | goto bad_swap; | 2096 | goto bad_swap; |
| 2073 | } | 2097 | } |
| 2098 | /* frontswap enabled? set up bit-per-page map for frontswap */ | ||
| 2099 | if (frontswap_enabled) | ||
| 2100 | frontswap_map = vzalloc(maxpages / sizeof(long)); | ||
| 2074 | 2101 | ||
| 2075 | if (p->bdev) { | 2102 | if (p->bdev) { |
| 2076 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2103 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
| @@ -2086,14 +2113,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 2086 | if (swap_flags & SWAP_FLAG_PREFER) | 2113 | if (swap_flags & SWAP_FLAG_PREFER) |
| 2087 | prio = | 2114 | prio = |
| 2088 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2115 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
| 2089 | enable_swap_info(p, prio, swap_map); | 2116 | enable_swap_info(p, prio, swap_map, frontswap_map); |
| 2090 | 2117 | ||
| 2091 | printk(KERN_INFO "Adding %uk swap on %s. " | 2118 | printk(KERN_INFO "Adding %uk swap on %s. " |
| 2092 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2119 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
| 2093 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2120 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
| 2094 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2121 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
| 2095 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2122 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
| 2096 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2123 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
| 2124 | (frontswap_map) ? "FS" : ""); | ||
| 2097 | 2125 | ||
| 2098 | mutex_unlock(&swapon_mutex); | 2126 | mutex_unlock(&swapon_mutex); |
| 2099 | atomic_inc(&proc_poll_event); | 2127 | atomic_inc(&proc_poll_event); |
