author    Minchan Kim <minchan@kernel.org>    2015-04-15 19:15:23 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2015-04-15 19:35:20 -0400
commit    2e40e163a25af3bd35d128d3e2e005916de5cce6 (patch)
tree      9bda77ac0d126b198c18ddb9eedf5e2b32061449 /mm
parent    018e9a49a554d915ba945a5faf34c592d65fe575 (diff)
zsmalloc: decouple handle and object
Recently we have started to use zram heavily, and several issues have popped up.

1) External fragmentation

I got a report from Juneho Choi that fork failed although there were plenty of free pages in the system. His investigation revealed that zram is one of the culprits behind heavy fragmentation, so there was no contiguous 16K page left for a pgd and fork failed on ARM.

2) Non-movable pages

Another problem with zram is that, inherently, users want to use zram as swap on small-memory systems, so they combine zram with CMA to use memory efficiently. Unfortunately this doesn't work well, because zram cannot use CMA's movable pages until it supports compaction. I got several reports that OOM happened with zram although there was lots of swap space and free space in the CMA area.

3) Internal fragmentation

zram has started to support a memory limit feature to bound its memory usage, and I sent a patchset (https://lkml.org/lkml/2014/9/21/148) for the VM to be harmonized with zram-swap so that anonymous page reclaim stops once zram has consumed memory up to the limit, even though there is free space on the swap device. One problem with that direction is that zram has no way to know about the holes that internal fragmentation leaves in the memory zsmalloc allocated, so zram would regard the swap as full although there is free space inside zsmalloc. To solve this, zram wants to trigger compaction of zsmalloc before deciding whether it is full or not.

This patchset is the first step toward addressing the issues above. It adds an indirection layer between handle and object location and supports manual compaction, which solves the third problem first of all. Once this patchset is merged, the next step is to make the VM aware of zsmalloc compaction so that generic compaction can move zsmalloc'ed pages automatically at runtime.

In my synthetic experiment (i.e., highly compressible data with heavy swap in/out on an 8G zram-swap), the data are as follows (ratio = allocated object bytes / total used bytes, so higher means less internal fragmentation):

Before:
  zram allocated object : 60212066 bytes
  zram total used       : 140103680 bytes
  ratio                 : 42.98 percent
  MemFree               : 840192 kB

After compaction:
  zram allocated object : 60212066 bytes
  zram total used       : 76185600 bytes
  ratio                 : 79.03 percent
  MemFree               : 901932 kB

Juneho reported the numbers below from his real platform with only light aging, so I think the benefit would be bigger on a real system aged for a long time.

- frag_ratio increased by 3% (higher is better)
- MemFree increased by about 6MB
- In buddyinfo, Normal 2^3: 4, 2^2: 1, 2^1 increased; HighMem 2^1: 21 increased

frag ratio after swap fragmentation
  used : 156677 kbytes
  total: 166092 kbytes
  frag_ratio : 94

meminfo before compaction
  MemFree: 83724 kB
  Node 0, zone  Normal  13642  1364  57  10  61  17  9  5  4  0  0
  Node 0, zone HighMem    425    29   1   0   0   0  0  0  0  0  0

num_migrated : 23630
compaction done

frag ratio after compaction
  used : 156673 kbytes
  total: 160564 kbytes
  frag_ratio : 97

meminfo after compaction
  MemFree: 89060 kB
  Node 0, zone  Normal  14076  1544  67  14  61  17  9  5  4  0  0
  Node 0, zone HighMem    863    50   1   0   0   0  0  0  0  0  0

This patchset adds more logic (about 480 lines) to zsmalloc, but when I tested a heavy swap-in/out program the regression in swap-in/out speed was marginal, because most of the overhead comes from compress/decompress and other MM reclaim work.

This patch (of 7):

Currently the handle of zsmalloc encodes the object's location directly, which makes migration hard to support. This patch decouples handle and object by adding an indirection layer: the handle is allocated dynamically and returned to the user. The handle is the address returned by a slab allocation, so it is unique, and the object's location is kept in the memory allocated for the handle. With that, we can change an object's position without changing the handle itself.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Gunho Lee <gunho.lee@lge.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjennings@variantweb.net>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
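To illustrate the indirection this patch introduces, here is a minimal userspace sketch, not the kernel code: the helper names mirror the ones added in the diff (alloc_handle/record_obj/handle_to_obj), but malloc() stands in for the kmem_cache and the "location" is just an arbitrary value. The point is that the handle is a separately allocated word that stores the encoded object location, so the allocator can later move the object and rewrite only that word while the caller's handle stays stable.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the encoded <PFN, obj_idx> location. */
typedef unsigned long obj_location_t;

/* Allocate an indirection slot (the kernel uses a kmem_cache for this). */
static unsigned long alloc_handle(void)
{
	return (unsigned long)malloc(sizeof(unsigned long));
}

/* Store the current object location behind the handle. */
static void record_obj(unsigned long handle, obj_location_t obj)
{
	*(unsigned long *)handle = obj;
}

/* One extra dereference resolves handle -> location. */
static obj_location_t handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

int main(void)
{
	unsigned long handle = alloc_handle();

	record_obj(handle, 0x1234);	/* object placed at "location" 0x1234 */
	printf("obj = %#lx\n", handle_to_obj(handle));

	record_obj(handle, 0x5678);	/* "compaction" moved the object... */
	printf("obj = %#lx\n", handle_to_obj(handle));	/* ...handle unchanged */

	free((void *)handle);
	return 0;
}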
Diffstat (limited to 'mm')
-rw-r--r--  mm/zsmalloc.c  126
1 file changed, 98 insertions(+), 28 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..6f3cfbf5e237 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -110,6 +110,8 @@
 #define ZS_MAX_ZSPAGE_ORDER 2
 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
 
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
 /*
  * Object location (<PFN>, <obj_idx>) is encoded as
  * as single (unsigned long) handle value.
@@ -140,7 +142,8 @@
 /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
 #define ZS_MIN_ALLOC_SIZE \
 	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
-#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
+/* each chunk includes extra space to keep handle */
+#define ZS_MAX_ALLOC_SIZE	(PAGE_SIZE + ZS_HANDLE_SIZE)
 
 /*
  * On systems with 4K page size, this gives 255 size classes! There is a
@@ -233,14 +236,24 @@ struct size_class {
  * This must be power of 2 and less than or equal to ZS_ALIGN
  */
 struct link_free {
-	/* Handle of next free chunk (encodes <PFN, obj_idx>) */
-	void *next;
+	union {
+		/*
+		 * Position of next free chunk (encodes <PFN, obj_idx>)
+		 * It's valid for non-allocated object
+		 */
+		void *next;
+		/*
+		 * Handle of allocated object.
+		 */
+		unsigned long handle;
+	};
 };
 
 struct zs_pool {
 	char *name;
 
 	struct size_class **size_class;
+	struct kmem_cache *handle_cachep;
 
 	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
@@ -269,6 +282,34 @@ struct mapping_area {
 	enum zs_mapmode vm_mm; /* mapping mode */
 };
 
+static int create_handle_cache(struct zs_pool *pool)
+{
+	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+					0, 0, NULL);
+	return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+	kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool)
+{
+	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+		pool->flags & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+	kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+	*(unsigned long *)handle = obj;
+}
+
 /* zpool driver */
 
 #ifdef CONFIG_ZPOOL
@@ -595,13 +636,18 @@ static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
  * decoded obj_idx back to its original value since it was adjusted in
  * obj_location_to_handle().
  */
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long handle, struct page **page,
 				unsigned long *obj_idx)
 {
 	*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
 	*obj_idx = (handle & OBJ_INDEX_MASK) - 1;
 }
 
+static unsigned long handle_to_obj(unsigned long handle)
+{
+	return *(unsigned long *)handle;
+}
+
 static unsigned long obj_idx_to_offset(struct page *page,
 				unsigned long obj_idx, int class_size)
 {
@@ -860,12 +906,16 @@ static void __zs_unmap_object(struct mapping_area *area,
 {
 	int sizes[2];
 	void *addr;
-	char *buf = area->vm_buf;
+	char *buf;
 
 	/* no write fastpath */
 	if (area->vm_mm == ZS_MM_RO)
 		goto out;
 
+	buf = area->vm_buf + ZS_HANDLE_SIZE;
+	size -= ZS_HANDLE_SIZE;
+	off += ZS_HANDLE_SIZE;
+
 	sizes[0] = PAGE_SIZE - off;
 	sizes[1] = size - sizes[0];
 
@@ -1153,13 +1203,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 			enum zs_mapmode mm)
 {
 	struct page *page;
-	unsigned long obj_idx, off;
+	unsigned long obj, obj_idx, off;
 
 	unsigned int class_idx;
 	enum fullness_group fg;
 	struct size_class *class;
 	struct mapping_area *area;
 	struct page *pages[2];
+	void *ret;
 
 	BUG_ON(!handle);
 
@@ -1170,7 +1221,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	 */
 	BUG_ON(in_interrupt());
 
-	obj_handle_to_location(handle, &page, &obj_idx);
+	obj = handle_to_obj(handle);
+	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
 	class = pool->size_class[class_idx];
 	off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1232,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	if (off + class->size <= PAGE_SIZE) {
 		/* this object is contained entirely within a page */
 		area->vm_addr = kmap_atomic(page);
-		return area->vm_addr + off;
+		ret = area->vm_addr + off;
+		goto out;
 	}
 
 	/* this object spans two pages */
@@ -1188,14 +1241,16 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	pages[1] = get_next_page(page);
 	BUG_ON(!pages[1]);
 
-	return __zs_map_object(area, pages, off, class->size);
+	ret = __zs_map_object(area, pages, off, class->size);
+out:
+	return ret + ZS_HANDLE_SIZE;
 }
 EXPORT_SYMBOL_GPL(zs_map_object);
 
 void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 {
 	struct page *page;
-	unsigned long obj_idx, off;
+	unsigned long obj, obj_idx, off;
 
 	unsigned int class_idx;
 	enum fullness_group fg;
@@ -1204,7 +1259,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 
 	BUG_ON(!handle);
 
-	obj_handle_to_location(handle, &page, &obj_idx);
+	obj = handle_to_obj(handle);
+	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
 	class = pool->size_class[class_idx];
 	off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1236,7 +1292,7 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
  */
 unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 {
-	unsigned long obj;
+	unsigned long handle, obj;
 	struct link_free *link;
 	struct size_class *class;
 	void *vaddr;
@@ -1244,9 +1300,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	struct page *first_page, *m_page;
 	unsigned long m_objidx, m_offset;
 
-	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
+	if (unlikely(!size || (size + ZS_HANDLE_SIZE) > ZS_MAX_ALLOC_SIZE))
+		return 0;
+
+	handle = alloc_handle(pool);
+	if (!handle)
 		return 0;
 
+	/* extra space in chunk to keep the handle */
+	size += ZS_HANDLE_SIZE;
 	class = pool->size_class[get_size_class_index(size)];
 
 	spin_lock(&class->lock);
@@ -1255,8 +1317,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	if (!first_page) {
 		spin_unlock(&class->lock);
 		first_page = alloc_zspage(class, pool->flags);
-		if (unlikely(!first_page))
+		if (unlikely(!first_page)) {
+			free_handle(pool, handle);
 			return 0;
+		}
 
 		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
 		atomic_long_add(class->pages_per_zspage,
@@ -1268,40 +1332,45 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	}
 
 	obj = (unsigned long)first_page->freelist;
-	obj_handle_to_location(obj, &m_page, &m_objidx);
+	obj_to_location(obj, &m_page, &m_objidx);
 	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
 
 	vaddr = kmap_atomic(m_page);
 	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
 	first_page->freelist = link->next;
-	memset(link, POISON_INUSE, sizeof(*link));
+
+	/* record handle in the header of allocated chunk */
+	link->handle = handle;
 	kunmap_atomic(vaddr);
 
 	first_page->inuse++;
 	zs_stat_inc(class, OBJ_USED, 1);
 	/* Now move the zspage to another fullness group, if required */
 	fix_fullness_group(pool, first_page);
+	record_obj(handle, obj);
 	spin_unlock(&class->lock);
 
-	return obj;
+	return handle;
 }
 EXPORT_SYMBOL_GPL(zs_malloc);
 
-void zs_free(struct zs_pool *pool, unsigned long obj)
+void zs_free(struct zs_pool *pool, unsigned long handle)
 {
 	struct link_free *link;
 	struct page *first_page, *f_page;
-	unsigned long f_objidx, f_offset;
+	unsigned long obj, f_objidx, f_offset;
 	void *vaddr;
 
 	int class_idx;
 	struct size_class *class;
 	enum fullness_group fullness;
 
-	if (unlikely(!obj))
+	if (unlikely(!handle))
 		return;
 
-	obj_handle_to_location(obj, &f_page, &f_objidx);
+	obj = handle_to_obj(handle);
+	free_handle(pool, handle);
+	obj_to_location(obj, &f_page, &f_objidx);
 	first_page = get_first_page(f_page);
 
 	get_zspage_mapping(first_page, &class_idx, &fullness);
@@ -1355,20 +1424,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
 	if (!pool)
 		return NULL;
 
-	pool->name = kstrdup(name, GFP_KERNEL);
-	if (!pool->name) {
-		kfree(pool);
-		return NULL;
-	}
-
 	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
 			GFP_KERNEL);
 	if (!pool->size_class) {
-		kfree(pool->name);
 		kfree(pool);
 		return NULL;
 	}
 
+	pool->name = kstrdup(name, GFP_KERNEL);
+	if (!pool->name)
+		goto err;
+
+	if (create_handle_cache(pool))
+		goto err;
+
 	/*
 	 * Iterate reversly, because, size of size_class that we want to use
 	 * for merging should be larger or equal to current size.
@@ -1450,6 +1519,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 		kfree(class);
 	}
 
+	destroy_handle_cache(pool);
 	kfree(pool->size_class);
 	kfree(pool->name);
 	kfree(pool);
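For context, the sketch below shows roughly how a caller such as zram uses this API after the patch; it is an illustrative, hedged example rather than code from the patch (the helper names store_compressed/load_and_free and the buffer handling are made up, and ZS_MM_WO/ZS_MM_RO are assumed from the zs_mapmode enum of that era). The point it demonstrates is that the handle returned by zs_malloc() is now opaque, and the pointer returned by zs_map_object() already skips the embedded handle word, so callers keep copying their payload exactly as before.

#include <linux/zsmalloc.h>
#include <linux/string.h>

/* Hypothetical helper: store one compressed buffer and return its handle. */
static unsigned long store_compressed(struct zs_pool *pool,
				      const void *src, size_t len)
{
	unsigned long handle;
	void *dst;

	handle = zs_malloc(pool, len);
	if (!handle)
		return 0;

	/* Map for write, copy the payload in, then unmap immediately. */
	dst = zs_map_object(pool, handle, ZS_MM_WO);
	memcpy(dst, src, len);
	zs_unmap_object(pool, handle);

	return handle;	/* opaque: no longer encodes <PFN, obj_idx> directly */
}

/* Hypothetical helper: read the payload back and release the object. */
static void load_and_free(struct zs_pool *pool, unsigned long handle,
			  void *dst, size_t len)
{
	void *src;

	src = zs_map_object(pool, handle, ZS_MM_RO);
	memcpy(dst, src, len);
	zs_unmap_object(pool, handle);

	zs_free(pool, handle);
}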