aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMinchan Kim <minchan@kernel.org>2016-07-26 18:23:31 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-07-26 19:19:19 -0400
commit48b4800a1c6af2cdda344ea4e2c843dcc1f6afc9 (patch)
tree3c9b37dc412f32cb18fdd57c98939ae33edc5c26
parentbfd093f5e7f09c1e41c43e7605893069975cd734 (diff)
zsmalloc: page migration support
This patch introduces a run-time migration feature for zspage. For migration, VM uses the page.lru field so it would be better to not use the page.next field, which is unified with page.lru, for our own purpose. For that, firstly, we can get the first object offset of the page via runtime calculation instead of using page.index, so we can use page.index as the link for page chaining instead of page.next. In case of a huge object, it stores the handle in page.index instead of the next link of page chaining because a huge object doesn't need a next link for page chaining. So get_next_page needs to identify a huge object to return NULL. For that, this patch uses the PG_owner_priv_1 page flag. For migration, it supports three functions * zs_page_isolate It isolates a zspage which includes a subpage VM wants to migrate from the class so no one can allocate new objects from the zspage. We could try to isolate a zspage multiple times, once per subpage, so subsequent isolation trials of other subpages of the zspage shouldn't fail. For that, we introduce the zspage.isolated count. With that, zs_page_isolate can know whether the zspage is already isolated for migration, so if it is isolated for migration, a subsequent isolation trial can be successful without trying further isolation. * zs_page_migrate First of all, it holds the write-side zspage->lock to prevent migrating other subpages in the zspage. Then, it locks all objects in the page VM wants to migrate. The reason we should lock all objects in the page is due to a race between zs_map_object and zs_page_migrate. zs_map_object zs_page_migrate pin_tag(handle) obj = handle_to_obj(handle) obj_to_location(obj, &page, &obj_idx); write_lock(&zspage->lock) if (!trypin_tag(handle)) goto unpin_object zspage = get_zspage(page); read_lock(&zspage->lock); If zs_page_migrate doesn't do trypin_tag, zs_map_object's page can become stale due to migration so it crashes. If it locks all objects successfully, it copies content from the old page to the new one and, finally, creates a new zspage chain with the new page. 
And if it's the last isolated subpage in the zspage, it puts the zspage back to the class. * zs_page_putback It returns an isolated zspage to the right fullness_group list if it fails to migrate a page. If it finds a zspage is ZS_EMPTY, it queues the zspage freeing to a workqueue. See below about async zspage freeing. This patch introduces asynchronous zspage freeing. The reason we need it is that we need page_lock to clear PG_movable but unfortunately, the zs_free path should be atomic, so the approach is to try to grab page_lock. If it gets page_lock of all pages successfully, it can free the zspage immediately. Otherwise, it queues a free request and frees the zspage via a workqueue in process context. If zs_free finds the zspage is isolated when it tries to free the zspage, it delays the freeing until zs_page_putback finds it, so it will free the zspage finally. In this patch, we expand fullness_list from ZS_EMPTY to ZS_FULL. First of all, it will use the ZS_EMPTY list for delayed freeing. And with the added ZS_FULL list, it makes it possible to identify whether a zspage is isolated or not via the list_empty(&zspage->list) test. [minchan@kernel.org: zsmalloc: keep first object offset in struct page] Link: http://lkml.kernel.org/r/1465788015-23195-1-git-send-email-minchan@kernel.org [minchan@kernel.org: zsmalloc: zspage sanity check] Link: http://lkml.kernel.org/r/20160603010129.GC3304@bbox Link: http://lkml.kernel.org/r/1464736881-24886-12-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim <minchan@kernel.org> Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/uapi/linux/magic.h1
-rw-r--r--mm/zsmalloc.c769
2 files changed, 654 insertions, 116 deletions
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index d829ce63529d..e398beac67b8 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -81,5 +81,6 @@
81/* Since UDF 2.01 is ISO 13346 based... */ 81/* Since UDF 2.01 is ISO 13346 based... */
82#define UDF_SUPER_MAGIC 0x15013346 82#define UDF_SUPER_MAGIC 0x15013346
83#define BALLOON_KVM_MAGIC 0x13661366 83#define BALLOON_KVM_MAGIC 0x13661366
84#define ZSMALLOC_MAGIC 0x58295829
84 85
85#endif /* __LINUX_MAGIC_H__ */ 86#endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c6fb543cfb98..04a4f063b4fd 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -17,14 +17,14 @@
17 * 17 *
18 * Usage of struct page fields: 18 * Usage of struct page fields:
19 * page->private: points to zspage 19 * page->private: points to zspage
20 * page->index: offset of the first object starting in this page. 20 * page->freelist(index): links together all component pages of a zspage
21 * For the first page, this is always 0, so we use this field 21 * For the huge page, this is always 0, so we use this field
22 * to store handle for huge object. 22 * to store handle.
23 * page->next: links together all component pages of a zspage
24 * 23 *
25 * Usage of struct page flags: 24 * Usage of struct page flags:
26 * PG_private: identifies the first component page 25 * PG_private: identifies the first component page
27 * PG_private2: identifies the last component page 26 * PG_private2: identifies the last component page
27 * PG_owner_priv_1: indentifies the huge component page
28 * 28 *
29 */ 29 */
30 30
@@ -49,6 +49,11 @@
49#include <linux/debugfs.h> 49#include <linux/debugfs.h>
50#include <linux/zsmalloc.h> 50#include <linux/zsmalloc.h>
51#include <linux/zpool.h> 51#include <linux/zpool.h>
52#include <linux/mount.h>
53#include <linux/compaction.h>
54#include <linux/pagemap.h>
55
56#define ZSPAGE_MAGIC 0x58
52 57
53/* 58/*
54 * This must be power of 2 and greater than of equal to sizeof(link_free). 59 * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -136,25 +141,23 @@
136 * We do not maintain any list for completely empty or full pages 141 * We do not maintain any list for completely empty or full pages
137 */ 142 */
138enum fullness_group { 143enum fullness_group {
139 ZS_ALMOST_FULL,
140 ZS_ALMOST_EMPTY,
141 ZS_EMPTY, 144 ZS_EMPTY,
142 ZS_FULL 145 ZS_ALMOST_EMPTY,
146 ZS_ALMOST_FULL,
147 ZS_FULL,
148 NR_ZS_FULLNESS,
143}; 149};
144 150
145enum zs_stat_type { 151enum zs_stat_type {
152 CLASS_EMPTY,
153 CLASS_ALMOST_EMPTY,
154 CLASS_ALMOST_FULL,
155 CLASS_FULL,
146 OBJ_ALLOCATED, 156 OBJ_ALLOCATED,
147 OBJ_USED, 157 OBJ_USED,
148 CLASS_ALMOST_FULL, 158 NR_ZS_STAT_TYPE,
149 CLASS_ALMOST_EMPTY,
150}; 159};
151 160
152#ifdef CONFIG_ZSMALLOC_STAT
153#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1)
154#else
155#define NR_ZS_STAT_TYPE (OBJ_USED + 1)
156#endif
157
158struct zs_size_stat { 161struct zs_size_stat {
159 unsigned long objs[NR_ZS_STAT_TYPE]; 162 unsigned long objs[NR_ZS_STAT_TYPE];
160}; 163};
@@ -163,6 +166,10 @@ struct zs_size_stat {
163static struct dentry *zs_stat_root; 166static struct dentry *zs_stat_root;
164#endif 167#endif
165 168
169#ifdef CONFIG_COMPACTION
170static struct vfsmount *zsmalloc_mnt;
171#endif
172
166/* 173/*
167 * number of size_classes 174 * number of size_classes
168 */ 175 */
@@ -186,23 +193,36 @@ static const int fullness_threshold_frac = 4;
186 193
187struct size_class { 194struct size_class {
188 spinlock_t lock; 195 spinlock_t lock;
189 struct list_head fullness_list[2]; 196 struct list_head fullness_list[NR_ZS_FULLNESS];
190 /* 197 /*
191 * Size of objects stored in this class. Must be multiple 198 * Size of objects stored in this class. Must be multiple
192 * of ZS_ALIGN. 199 * of ZS_ALIGN.
193 */ 200 */
194 int size; 201 int size;
195 int objs_per_zspage; 202 int objs_per_zspage;
196 unsigned int index;
197
198 struct zs_size_stat stats;
199
200 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 203 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
201 int pages_per_zspage; 204 int pages_per_zspage;
202 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ 205
203 bool huge; 206 unsigned int index;
207 struct zs_size_stat stats;
204}; 208};
205 209
210/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
211static void SetPageHugeObject(struct page *page)
212{
213 SetPageOwnerPriv1(page);
214}
215
216static void ClearPageHugeObject(struct page *page)
217{
218 ClearPageOwnerPriv1(page);
219}
220
221static int PageHugeObject(struct page *page)
222{
223 return PageOwnerPriv1(page);
224}
225
206/* 226/*
207 * Placed within free objects to form a singly linked list. 227 * Placed within free objects to form a singly linked list.
208 * For every zspage, zspage->freeobj gives head of this list. 228 * For every zspage, zspage->freeobj gives head of this list.
@@ -244,6 +264,10 @@ struct zs_pool {
244#ifdef CONFIG_ZSMALLOC_STAT 264#ifdef CONFIG_ZSMALLOC_STAT
245 struct dentry *stat_dentry; 265 struct dentry *stat_dentry;
246#endif 266#endif
267#ifdef CONFIG_COMPACTION
268 struct inode *inode;
269 struct work_struct free_work;
270#endif
247}; 271};
248 272
249/* 273/*
@@ -252,16 +276,23 @@ struct zs_pool {
252 */ 276 */
253#define FULLNESS_BITS 2 277#define FULLNESS_BITS 2
254#define CLASS_BITS 8 278#define CLASS_BITS 8
279#define ISOLATED_BITS 3
280#define MAGIC_VAL_BITS 8
255 281
256struct zspage { 282struct zspage {
257 struct { 283 struct {
258 unsigned int fullness:FULLNESS_BITS; 284 unsigned int fullness:FULLNESS_BITS;
259 unsigned int class:CLASS_BITS; 285 unsigned int class:CLASS_BITS;
286 unsigned int isolated:ISOLATED_BITS;
287 unsigned int magic:MAGIC_VAL_BITS;
260 }; 288 };
261 unsigned int inuse; 289 unsigned int inuse;
262 unsigned int freeobj; 290 unsigned int freeobj;
263 struct page *first_page; 291 struct page *first_page;
264 struct list_head list; /* fullness list */ 292 struct list_head list; /* fullness list */
293#ifdef CONFIG_COMPACTION
294 rwlock_t lock;
295#endif
265}; 296};
266 297
267struct mapping_area { 298struct mapping_area {
@@ -274,6 +305,28 @@ struct mapping_area {
274 enum zs_mapmode vm_mm; /* mapping mode */ 305 enum zs_mapmode vm_mm; /* mapping mode */
275}; 306};
276 307
308#ifdef CONFIG_COMPACTION
309static int zs_register_migration(struct zs_pool *pool);
310static void zs_unregister_migration(struct zs_pool *pool);
311static void migrate_lock_init(struct zspage *zspage);
312static void migrate_read_lock(struct zspage *zspage);
313static void migrate_read_unlock(struct zspage *zspage);
314static void kick_deferred_free(struct zs_pool *pool);
315static void init_deferred_free(struct zs_pool *pool);
316static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
317#else
318static int zsmalloc_mount(void) { return 0; }
319static void zsmalloc_unmount(void) {}
320static int zs_register_migration(struct zs_pool *pool) { return 0; }
321static void zs_unregister_migration(struct zs_pool *pool) {}
322static void migrate_lock_init(struct zspage *zspage) {}
323static void migrate_read_lock(struct zspage *zspage) {}
324static void migrate_read_unlock(struct zspage *zspage) {}
325static void kick_deferred_free(struct zs_pool *pool) {}
326static void init_deferred_free(struct zs_pool *pool) {}
327static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
328#endif
329
277static int create_cache(struct zs_pool *pool) 330static int create_cache(struct zs_pool *pool)
278{ 331{
279 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 332 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
@@ -301,7 +354,7 @@ static void destroy_cache(struct zs_pool *pool)
301static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) 354static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
302{ 355{
303 return (unsigned long)kmem_cache_alloc(pool->handle_cachep, 356 return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
304 gfp & ~__GFP_HIGHMEM); 357 gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
305} 358}
306 359
307static void cache_free_handle(struct zs_pool *pool, unsigned long handle) 360static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
@@ -311,7 +364,8 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
311 364
312static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) 365static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
313{ 366{
314 return kmem_cache_alloc(pool->zspage_cachep, flags & ~__GFP_HIGHMEM); 367 return kmem_cache_alloc(pool->zspage_cachep,
368 flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
315}; 369};
316 370
317static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) 371static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
@@ -421,11 +475,17 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
421/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 475/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
422static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 476static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
423 477
478static bool is_zspage_isolated(struct zspage *zspage)
479{
480 return zspage->isolated;
481}
482
424static int is_first_page(struct page *page) 483static int is_first_page(struct page *page)
425{ 484{
426 return PagePrivate(page); 485 return PagePrivate(page);
427} 486}
428 487
488/* Protected by class->lock */
429static inline int get_zspage_inuse(struct zspage *zspage) 489static inline int get_zspage_inuse(struct zspage *zspage)
430{ 490{
431 return zspage->inuse; 491 return zspage->inuse;
@@ -441,20 +501,22 @@ static inline void mod_zspage_inuse(struct zspage *zspage, int val)
441 zspage->inuse += val; 501 zspage->inuse += val;
442} 502}
443 503
444static inline int get_first_obj_offset(struct page *page) 504static inline struct page *get_first_page(struct zspage *zspage)
445{ 505{
446 if (is_first_page(page)) 506 struct page *first_page = zspage->first_page;
447 return 0;
448 507
449 return page->index; 508 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
509 return first_page;
450} 510}
451 511
452static inline void set_first_obj_offset(struct page *page, int offset) 512static inline int get_first_obj_offset(struct page *page)
453{ 513{
454 if (is_first_page(page)) 514 return page->units;
455 return; 515}
456 516
457 page->index = offset; 517static inline void set_first_obj_offset(struct page *page, int offset)
518{
519 page->units = offset;
458} 520}
459 521
460static inline unsigned int get_freeobj(struct zspage *zspage) 522static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -471,6 +533,8 @@ static void get_zspage_mapping(struct zspage *zspage,
471 unsigned int *class_idx, 533 unsigned int *class_idx,
472 enum fullness_group *fullness) 534 enum fullness_group *fullness)
473{ 535{
536 BUG_ON(zspage->magic != ZSPAGE_MAGIC);
537
474 *fullness = zspage->fullness; 538 *fullness = zspage->fullness;
475 *class_idx = zspage->class; 539 *class_idx = zspage->class;
476} 540}
@@ -504,23 +568,19 @@ static int get_size_class_index(int size)
504static inline void zs_stat_inc(struct size_class *class, 568static inline void zs_stat_inc(struct size_class *class,
505 enum zs_stat_type type, unsigned long cnt) 569 enum zs_stat_type type, unsigned long cnt)
506{ 570{
507 if (type < NR_ZS_STAT_TYPE) 571 class->stats.objs[type] += cnt;
508 class->stats.objs[type] += cnt;
509} 572}
510 573
511static inline void zs_stat_dec(struct size_class *class, 574static inline void zs_stat_dec(struct size_class *class,
512 enum zs_stat_type type, unsigned long cnt) 575 enum zs_stat_type type, unsigned long cnt)
513{ 576{
514 if (type < NR_ZS_STAT_TYPE) 577 class->stats.objs[type] -= cnt;
515 class->stats.objs[type] -= cnt;
516} 578}
517 579
518static inline unsigned long zs_stat_get(struct size_class *class, 580static inline unsigned long zs_stat_get(struct size_class *class,
519 enum zs_stat_type type) 581 enum zs_stat_type type)
520{ 582{
521 if (type < NR_ZS_STAT_TYPE) 583 return class->stats.objs[type];
522 return class->stats.objs[type];
523 return 0;
524} 584}
525 585
526#ifdef CONFIG_ZSMALLOC_STAT 586#ifdef CONFIG_ZSMALLOC_STAT
@@ -664,6 +724,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
664} 724}
665#endif 725#endif
666 726
727
667/* 728/*
668 * For each size class, zspages are divided into different groups 729 * For each size class, zspages are divided into different groups
669 * depending on how "full" they are. This was done so that we could 730 * depending on how "full" they are. This was done so that we could
@@ -704,15 +765,9 @@ static void insert_zspage(struct size_class *class,
704{ 765{
705 struct zspage *head; 766 struct zspage *head;
706 767
707 if (fullness >= ZS_EMPTY) 768 zs_stat_inc(class, fullness, 1);
708 return;
709
710 head = list_first_entry_or_null(&class->fullness_list[fullness], 769 head = list_first_entry_or_null(&class->fullness_list[fullness],
711 struct zspage, list); 770 struct zspage, list);
712
713 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
714 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
715
716 /* 771 /*
717 * We want to see more ZS_FULL pages and less almost empty/full. 772 * We want to see more ZS_FULL pages and less almost empty/full.
718 * Put pages with higher ->inuse first. 773 * Put pages with higher ->inuse first.
@@ -734,14 +789,11 @@ static void remove_zspage(struct size_class *class,
734 struct zspage *zspage, 789 struct zspage *zspage,
735 enum fullness_group fullness) 790 enum fullness_group fullness)
736{ 791{
737 if (fullness >= ZS_EMPTY)
738 return;
739
740 VM_BUG_ON(list_empty(&class->fullness_list[fullness])); 792 VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
793 VM_BUG_ON(is_zspage_isolated(zspage));
741 794
742 list_del_init(&zspage->list); 795 list_del_init(&zspage->list);
743 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? 796 zs_stat_dec(class, fullness, 1);
744 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
745} 797}
746 798
747/* 799/*
@@ -764,8 +816,11 @@ static enum fullness_group fix_fullness_group(struct size_class *class,
764 if (newfg == currfg) 816 if (newfg == currfg)
765 goto out; 817 goto out;
766 818
767 remove_zspage(class, zspage, currfg); 819 if (!is_zspage_isolated(zspage)) {
768 insert_zspage(class, zspage, newfg); 820 remove_zspage(class, zspage, currfg);
821 insert_zspage(class, zspage, newfg);
822 }
823
769 set_zspage_mapping(zspage, class_idx, newfg); 824 set_zspage_mapping(zspage, class_idx, newfg);
770 825
771out: 826out:
@@ -808,19 +863,20 @@ static int get_pages_per_zspage(int class_size)
808 return max_usedpc_order; 863 return max_usedpc_order;
809} 864}
810 865
811static struct page *get_first_page(struct zspage *zspage)
812{
813 return zspage->first_page;
814}
815
816static struct zspage *get_zspage(struct page *page) 866static struct zspage *get_zspage(struct page *page)
817{ 867{
818 return (struct zspage *)page->private; 868 struct zspage *zspage = (struct zspage *)page->private;
869
870 BUG_ON(zspage->magic != ZSPAGE_MAGIC);
871 return zspage;
819} 872}
820 873
821static struct page *get_next_page(struct page *page) 874static struct page *get_next_page(struct page *page)
822{ 875{
823 return page->next; 876 if (unlikely(PageHugeObject(page)))
877 return NULL;
878
879 return page->freelist;
824} 880}
825 881
826/** 882/**
@@ -857,16 +913,20 @@ static unsigned long handle_to_obj(unsigned long handle)
857 return *(unsigned long *)handle; 913 return *(unsigned long *)handle;
858} 914}
859 915
860static unsigned long obj_to_head(struct size_class *class, struct page *page, 916static unsigned long obj_to_head(struct page *page, void *obj)
861 void *obj)
862{ 917{
863 if (class->huge) { 918 if (unlikely(PageHugeObject(page))) {
864 VM_BUG_ON_PAGE(!is_first_page(page), page); 919 VM_BUG_ON_PAGE(!is_first_page(page), page);
865 return page->index; 920 return page->index;
866 } else 921 } else
867 return *(unsigned long *)obj; 922 return *(unsigned long *)obj;
868} 923}
869 924
925static inline int testpin_tag(unsigned long handle)
926{
927 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
928}
929
870static inline int trypin_tag(unsigned long handle) 930static inline int trypin_tag(unsigned long handle)
871{ 931{
872 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); 932 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
@@ -884,27 +944,94 @@ static void unpin_tag(unsigned long handle)
884 944
885static void reset_page(struct page *page) 945static void reset_page(struct page *page)
886{ 946{
947 __ClearPageMovable(page);
887 clear_bit(PG_private, &page->flags); 948 clear_bit(PG_private, &page->flags);
888 clear_bit(PG_private_2, &page->flags); 949 clear_bit(PG_private_2, &page->flags);
889 set_page_private(page, 0); 950 set_page_private(page, 0);
890 page->index = 0; 951 page_mapcount_reset(page);
952 ClearPageHugeObject(page);
953 page->freelist = NULL;
954}
955
956/*
957 * To prevent zspage destroy during migration, zspage freeing should
958 * hold locks of all pages in the zspage.
959 */
960void lock_zspage(struct zspage *zspage)
961{
962 struct page *page = get_first_page(zspage);
963
964 do {
965 lock_page(page);
966 } while ((page = get_next_page(page)) != NULL);
967}
968
969int trylock_zspage(struct zspage *zspage)
970{
971 struct page *cursor, *fail;
972
973 for (cursor = get_first_page(zspage); cursor != NULL; cursor =
974 get_next_page(cursor)) {
975 if (!trylock_page(cursor)) {
976 fail = cursor;
977 goto unlock;
978 }
979 }
980
981 return 1;
982unlock:
983 for (cursor = get_first_page(zspage); cursor != fail; cursor =
984 get_next_page(cursor))
985 unlock_page(cursor);
986
987 return 0;
891} 988}
892 989
893static void free_zspage(struct zs_pool *pool, struct zspage *zspage) 990static void __free_zspage(struct zs_pool *pool, struct size_class *class,
991 struct zspage *zspage)
894{ 992{
895 struct page *page, *next; 993 struct page *page, *next;
994 enum fullness_group fg;
995 unsigned int class_idx;
996
997 get_zspage_mapping(zspage, &class_idx, &fg);
998
999 assert_spin_locked(&class->lock);
896 1000
897 VM_BUG_ON(get_zspage_inuse(zspage)); 1001 VM_BUG_ON(get_zspage_inuse(zspage));
1002 VM_BUG_ON(fg != ZS_EMPTY);
898 1003
899 next = page = zspage->first_page; 1004 next = page = get_first_page(zspage);
900 do { 1005 do {
901 next = page->next; 1006 VM_BUG_ON_PAGE(!PageLocked(page), page);
1007 next = get_next_page(page);
902 reset_page(page); 1008 reset_page(page);
1009 unlock_page(page);
903 put_page(page); 1010 put_page(page);
904 page = next; 1011 page = next;
905 } while (page != NULL); 1012 } while (page != NULL);
906 1013
907 cache_free_zspage(pool, zspage); 1014 cache_free_zspage(pool, zspage);
1015
1016 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1017 class->size, class->pages_per_zspage));
1018 atomic_long_sub(class->pages_per_zspage,
1019 &pool->pages_allocated);
1020}
1021
1022static void free_zspage(struct zs_pool *pool, struct size_class *class,
1023 struct zspage *zspage)
1024{
1025 VM_BUG_ON(get_zspage_inuse(zspage));
1026 VM_BUG_ON(list_empty(&zspage->list));
1027
1028 if (!trylock_zspage(zspage)) {
1029 kick_deferred_free(pool);
1030 return;
1031 }
1032
1033 remove_zspage(class, zspage, ZS_EMPTY);
1034 __free_zspage(pool, class, zspage);
908} 1035}
909 1036
910/* Initialize a newly allocated zspage */ 1037/* Initialize a newly allocated zspage */
@@ -912,7 +1039,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
912{ 1039{
913 unsigned int freeobj = 1; 1040 unsigned int freeobj = 1;
914 unsigned long off = 0; 1041 unsigned long off = 0;
915 struct page *page = zspage->first_page; 1042 struct page *page = get_first_page(zspage);
916 1043
917 while (page) { 1044 while (page) {
918 struct page *next_page; 1045 struct page *next_page;
@@ -952,16 +1079,17 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
952 set_freeobj(zspage, 0); 1079 set_freeobj(zspage, 0);
953} 1080}
954 1081
955static void create_page_chain(struct zspage *zspage, struct page *pages[], 1082static void create_page_chain(struct size_class *class, struct zspage *zspage,
956 int nr_pages) 1083 struct page *pages[])
957{ 1084{
958 int i; 1085 int i;
959 struct page *page; 1086 struct page *page;
960 struct page *prev_page = NULL; 1087 struct page *prev_page = NULL;
1088 int nr_pages = class->pages_per_zspage;
961 1089
962 /* 1090 /*
963 * Allocate individual pages and link them together as: 1091 * Allocate individual pages and link them together as:
964 * 1. all pages are linked together using page->next 1092 * 1. all pages are linked together using page->freelist
965 * 2. each sub-page point to zspage using page->private 1093 * 2. each sub-page point to zspage using page->private
966 * 1094 *
967 * we set PG_private to identify the first page (i.e. no other sub-page 1095 * we set PG_private to identify the first page (i.e. no other sub-page
@@ -970,16 +1098,18 @@ static void create_page_chain(struct zspage *zspage, struct page *pages[],
970 for (i = 0; i < nr_pages; i++) { 1098 for (i = 0; i < nr_pages; i++) {
971 page = pages[i]; 1099 page = pages[i];
972 set_page_private(page, (unsigned long)zspage); 1100 set_page_private(page, (unsigned long)zspage);
1101 page->freelist = NULL;
973 if (i == 0) { 1102 if (i == 0) {
974 zspage->first_page = page; 1103 zspage->first_page = page;
975 SetPagePrivate(page); 1104 SetPagePrivate(page);
1105 if (unlikely(class->objs_per_zspage == 1 &&
1106 class->pages_per_zspage == 1))
1107 SetPageHugeObject(page);
976 } else { 1108 } else {
977 prev_page->next = page; 1109 prev_page->freelist = page;
978 } 1110 }
979 if (i == nr_pages - 1) { 1111 if (i == nr_pages - 1)
980 SetPagePrivate2(page); 1112 SetPagePrivate2(page);
981 page->next = NULL;
982 }
983 prev_page = page; 1113 prev_page = page;
984 } 1114 }
985} 1115}
@@ -999,6 +1129,8 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
999 return NULL; 1129 return NULL;
1000 1130
1001 memset(zspage, 0, sizeof(struct zspage)); 1131 memset(zspage, 0, sizeof(struct zspage));
1132 zspage->magic = ZSPAGE_MAGIC;
1133 migrate_lock_init(zspage);
1002 1134
1003 for (i = 0; i < class->pages_per_zspage; i++) { 1135 for (i = 0; i < class->pages_per_zspage; i++) {
1004 struct page *page; 1136 struct page *page;
@@ -1013,7 +1145,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
1013 pages[i] = page; 1145 pages[i] = page;
1014 } 1146 }
1015 1147
1016 create_page_chain(zspage, pages, class->pages_per_zspage); 1148 create_page_chain(class, zspage, pages);
1017 init_zspage(class, zspage); 1149 init_zspage(class, zspage);
1018 1150
1019 return zspage; 1151 return zspage;
@@ -1024,7 +1156,7 @@ static struct zspage *find_get_zspage(struct size_class *class)
1024 int i; 1156 int i;
1025 struct zspage *zspage; 1157 struct zspage *zspage;
1026 1158
1027 for (i = ZS_ALMOST_FULL; i <= ZS_ALMOST_EMPTY; i++) { 1159 for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
1028 zspage = list_first_entry_or_null(&class->fullness_list[i], 1160 zspage = list_first_entry_or_null(&class->fullness_list[i],
1029 struct zspage, list); 1161 struct zspage, list);
1030 if (zspage) 1162 if (zspage)
@@ -1289,6 +1421,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1289 obj = handle_to_obj(handle); 1421 obj = handle_to_obj(handle);
1290 obj_to_location(obj, &page, &obj_idx); 1422 obj_to_location(obj, &page, &obj_idx);
1291 zspage = get_zspage(page); 1423 zspage = get_zspage(page);
1424
1425 /* migration cannot move any subpage in this zspage */
1426 migrate_read_lock(zspage);
1427
1292 get_zspage_mapping(zspage, &class_idx, &fg); 1428 get_zspage_mapping(zspage, &class_idx, &fg);
1293 class = pool->size_class[class_idx]; 1429 class = pool->size_class[class_idx];
1294 off = (class->size * obj_idx) & ~PAGE_MASK; 1430 off = (class->size * obj_idx) & ~PAGE_MASK;
@@ -1309,7 +1445,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1309 1445
1310 ret = __zs_map_object(area, pages, off, class->size); 1446 ret = __zs_map_object(area, pages, off, class->size);
1311out: 1447out:
1312 if (!class->huge) 1448 if (likely(!PageHugeObject(page)))
1313 ret += ZS_HANDLE_SIZE; 1449 ret += ZS_HANDLE_SIZE;
1314 1450
1315 return ret; 1451 return ret;
@@ -1348,6 +1484,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1348 __zs_unmap_object(area, pages, off, class->size); 1484 __zs_unmap_object(area, pages, off, class->size);
1349 } 1485 }
1350 put_cpu_var(zs_map_area); 1486 put_cpu_var(zs_map_area);
1487
1488 migrate_read_unlock(zspage);
1351 unpin_tag(handle); 1489 unpin_tag(handle);
1352} 1490}
1353EXPORT_SYMBOL_GPL(zs_unmap_object); 1491EXPORT_SYMBOL_GPL(zs_unmap_object);
@@ -1377,7 +1515,7 @@ static unsigned long obj_malloc(struct size_class *class,
1377 vaddr = kmap_atomic(m_page); 1515 vaddr = kmap_atomic(m_page);
1378 link = (struct link_free *)vaddr + m_offset / sizeof(*link); 1516 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1379 set_freeobj(zspage, link->next >> OBJ_ALLOCATED_TAG); 1517 set_freeobj(zspage, link->next >> OBJ_ALLOCATED_TAG);
1380 if (!class->huge) 1518 if (likely(!PageHugeObject(m_page)))
1381 /* record handle in the header of allocated chunk */ 1519 /* record handle in the header of allocated chunk */
1382 link->handle = handle; 1520 link->handle = handle;
1383 else 1521 else
@@ -1407,6 +1545,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
1407{ 1545{
1408 unsigned long handle, obj; 1546 unsigned long handle, obj;
1409 struct size_class *class; 1547 struct size_class *class;
1548 enum fullness_group newfg;
1410 struct zspage *zspage; 1549 struct zspage *zspage;
1411 1550
1412 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1551 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
@@ -1422,28 +1561,37 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
1422 1561
1423 spin_lock(&class->lock); 1562 spin_lock(&class->lock);
1424 zspage = find_get_zspage(class); 1563 zspage = find_get_zspage(class);
1425 1564 if (likely(zspage)) {
1426 if (!zspage) { 1565 obj = obj_malloc(class, zspage, handle);
1566 /* Now move the zspage to another fullness group, if required */
1567 fix_fullness_group(class, zspage);
1568 record_obj(handle, obj);
1427 spin_unlock(&class->lock); 1569 spin_unlock(&class->lock);
1428 zspage = alloc_zspage(pool, class, gfp);
1429 if (unlikely(!zspage)) {
1430 cache_free_handle(pool, handle);
1431 return 0;
1432 }
1433 1570
1434 set_zspage_mapping(zspage, class->index, ZS_EMPTY); 1571 return handle;
1435 atomic_long_add(class->pages_per_zspage, 1572 }
1436 &pool->pages_allocated);
1437 1573
1438 spin_lock(&class->lock); 1574 spin_unlock(&class->lock);
1439 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1575
1440 class->size, class->pages_per_zspage)); 1576 zspage = alloc_zspage(pool, class, gfp);
1577 if (!zspage) {
1578 cache_free_handle(pool, handle);
1579 return 0;
1441 } 1580 }
1442 1581
1582 spin_lock(&class->lock);
1443 obj = obj_malloc(class, zspage, handle); 1583 obj = obj_malloc(class, zspage, handle);
1444 /* Now move the zspage to another fullness group, if required */ 1584 newfg = get_fullness_group(class, zspage);
1445 fix_fullness_group(class, zspage); 1585 insert_zspage(class, zspage, newfg);
1586 set_zspage_mapping(zspage, class->index, newfg);
1446 record_obj(handle, obj); 1587 record_obj(handle, obj);
1588 atomic_long_add(class->pages_per_zspage,
1589 &pool->pages_allocated);
1590 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1591 class->size, class->pages_per_zspage));
1592
1593 /* We completely set up zspage so mark them as movable */
1594 SetZsPageMovable(pool, zspage);
1447 spin_unlock(&class->lock); 1595 spin_unlock(&class->lock);
1448 1596
1449 return handle; 1597 return handle;
@@ -1484,6 +1632,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
1484 int class_idx; 1632 int class_idx;
1485 struct size_class *class; 1633 struct size_class *class;
1486 enum fullness_group fullness; 1634 enum fullness_group fullness;
1635 bool isolated;
1487 1636
1488 if (unlikely(!handle)) 1637 if (unlikely(!handle))
1489 return; 1638 return;
@@ -1493,22 +1642,28 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
1493 obj_to_location(obj, &f_page, &f_objidx); 1642 obj_to_location(obj, &f_page, &f_objidx);
1494 zspage = get_zspage(f_page); 1643 zspage = get_zspage(f_page);
1495 1644
1645 migrate_read_lock(zspage);
1646
1496 get_zspage_mapping(zspage, &class_idx, &fullness); 1647 get_zspage_mapping(zspage, &class_idx, &fullness);
1497 class = pool->size_class[class_idx]; 1648 class = pool->size_class[class_idx];
1498 1649
1499 spin_lock(&class->lock); 1650 spin_lock(&class->lock);
1500 obj_free(class, obj); 1651 obj_free(class, obj);
1501 fullness = fix_fullness_group(class, zspage); 1652 fullness = fix_fullness_group(class, zspage);
1502 if (fullness == ZS_EMPTY) { 1653 if (fullness != ZS_EMPTY) {
1503 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1654 migrate_read_unlock(zspage);
1504 class->size, class->pages_per_zspage)); 1655 goto out;
1505 atomic_long_sub(class->pages_per_zspage,
1506 &pool->pages_allocated);
1507 free_zspage(pool, zspage);
1508 } 1656 }
1657
1658 isolated = is_zspage_isolated(zspage);
1659 migrate_read_unlock(zspage);
1660 /* If zspage is isolated, zs_page_putback will free the zspage */
1661 if (likely(!isolated))
1662 free_zspage(pool, class, zspage);
1663out:
1664
1509 spin_unlock(&class->lock); 1665 spin_unlock(&class->lock);
1510 unpin_tag(handle); 1666 unpin_tag(handle);
1511
1512 cache_free_handle(pool, handle); 1667 cache_free_handle(pool, handle);
1513} 1668}
1514EXPORT_SYMBOL_GPL(zs_free); 1669EXPORT_SYMBOL_GPL(zs_free);
@@ -1592,7 +1747,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
1592 offset += class->size * index; 1747 offset += class->size * index;
1593 1748
1594 while (offset < PAGE_SIZE) { 1749 while (offset < PAGE_SIZE) {
1595 head = obj_to_head(class, page, addr + offset); 1750 head = obj_to_head(page, addr + offset);
1596 if (head & OBJ_ALLOCATED_TAG) { 1751 if (head & OBJ_ALLOCATED_TAG) {
1597 handle = head & ~OBJ_ALLOCATED_TAG; 1752 handle = head & ~OBJ_ALLOCATED_TAG;
1598 if (trypin_tag(handle)) 1753 if (trypin_tag(handle))
@@ -1684,6 +1839,7 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source)
1684 zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], 1839 zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
1685 struct zspage, list); 1840 struct zspage, list);
1686 if (zspage) { 1841 if (zspage) {
1842 VM_BUG_ON(is_zspage_isolated(zspage));
1687 remove_zspage(class, zspage, fg[i]); 1843 remove_zspage(class, zspage, fg[i]);
1688 return zspage; 1844 return zspage;
1689 } 1845 }
@@ -1704,6 +1860,8 @@ static enum fullness_group putback_zspage(struct size_class *class,
1704{ 1860{
1705 enum fullness_group fullness; 1861 enum fullness_group fullness;
1706 1862
1863 VM_BUG_ON(is_zspage_isolated(zspage));
1864
1707 fullness = get_fullness_group(class, zspage); 1865 fullness = get_fullness_group(class, zspage);
1708 insert_zspage(class, zspage, fullness); 1866 insert_zspage(class, zspage, fullness);
1709 set_zspage_mapping(zspage, class->index, fullness); 1867 set_zspage_mapping(zspage, class->index, fullness);
@@ -1711,6 +1869,378 @@ static enum fullness_group putback_zspage(struct size_class *class,
1711 return fullness; 1869 return fullness;
1712} 1870}
1713 1871
1872#ifdef CONFIG_COMPACTION
/*
 * Mount callback for the internal zsmalloc pseudo-filesystem.  Each pool
 * later allocates an anonymous inode from this superblock (see
 * zs_register_migration) so its pages have a mapping with migration ops.
 */
static struct dentry *zs_mount(struct file_system_type *fs_type,
				int flags, const char *dev_name, void *data)
{
	static const struct dentry_operations ops = {
		.d_dname = simple_dname,
	};

	return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
}
1882
/* Pseudo-fs type backing the pool inodes; torn down with kill_anon_super. */
static struct file_system_type zsmalloc_fs = {
	.name		= "zsmalloc",
	.mount		= zs_mount,
	.kill_sb	= kill_anon_super,
};
1888
1889static int zsmalloc_mount(void)
1890{
1891 int ret = 0;
1892
1893 zsmalloc_mnt = kern_mount(&zsmalloc_fs);
1894 if (IS_ERR(zsmalloc_mnt))
1895 ret = PTR_ERR(zsmalloc_mnt);
1896
1897 return ret;
1898}
1899
/* Counterpart of zsmalloc_mount(), called at module exit / init failure. */
static void zsmalloc_unmount(void)
{
	kern_unmount(zsmalloc_mnt);
}
1904
/* Initialize the per-zspage migration rwlock (see migrate_{read,write}_lock). */
static void migrate_lock_init(struct zspage *zspage)
{
	rwlock_init(&zspage->lock);
}
1909
/*
 * Shared lock taken by object-path readers (e.g. zs_free) to keep the
 * zspage's page chain stable against a concurrent zs_page_migrate, which
 * takes the write side.
 */
static void migrate_read_lock(struct zspage *zspage)
{
	read_lock(&zspage->lock);
}
1914
/* Release the shared side of the zspage migration lock. */
static void migrate_read_unlock(struct zspage *zspage)
{
	read_unlock(&zspage->lock);
}
1919
/*
 * Exclusive lock taken by zs_page_migrate so no other subpage of this
 * zspage can be migrated or read through concurrently.
 */
static void migrate_write_lock(struct zspage *zspage)
{
	write_lock(&zspage->lock);
}
1924
/* Release the exclusive side of the zspage migration lock. */
static void migrate_write_unlock(struct zspage *zspage)
{
	write_unlock(&zspage->lock);
}
1929
/*
 * Number of isolated subpages for *page migration* in this zspage.
 * Callers (zs_page_isolate) hold class->lock, which serializes the
 * non-atomic increment.
 */
static void inc_zspage_isolation(struct zspage *zspage)
{
	zspage->isolated++;
}
1935
/*
 * Drop one isolated-subpage reference; callers (zs_page_migrate,
 * zs_page_putback) hold class->lock.
 */
static void dec_zspage_isolation(struct zspage *zspage)
{
	zspage->isolated--;
}
1940
1941static void replace_sub_page(struct size_class *class, struct zspage *zspage,
1942 struct page *newpage, struct page *oldpage)
1943{
1944 struct page *page;
1945 struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
1946 int idx = 0;
1947
1948 page = get_first_page(zspage);
1949 do {
1950 if (page == oldpage)
1951 pages[idx] = newpage;
1952 else
1953 pages[idx] = page;
1954 idx++;
1955 } while ((page = get_next_page(page)) != NULL);
1956
1957 create_page_chain(class, zspage, pages);
1958 set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
1959 if (unlikely(PageHugeObject(oldpage)))
1960 newpage->index = oldpage->index;
1961 __SetPageMovable(newpage, page_mapping(oldpage));
1962}
1963
/*
 * VM migration hook: isolate one subpage of a zspage prior to migration.
 *
 * On first isolation of any subpage, the whole zspage is removed from its
 * fullness list so no new objects can be allocated from it; subsequent
 * subpage isolations only bump zspage->isolated.
 *
 * Returns true on success, false when the zspage is already empty or is
 * off-list for object compaction rather than page migration.
 */
bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
	struct zs_pool *pool;
	struct size_class *class;
	int class_idx;
	enum fullness_group fullness;
	struct zspage *zspage;
	struct address_space *mapping;

	/*
	 * Page is locked so zspage couldn't be destroyed. For detail, look at
	 * lock_zspage in free_zspage.
	 */
	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(PageIsolated(page), page);

	zspage = get_zspage(page);

	/*
	 * Without class lock, fullness could be stale while class_idx is okay
	 * because class_idx is constant unless page is freed so we should get
	 * fullness again under class lock.
	 */
	get_zspage_mapping(zspage, &class_idx, &fullness);
	mapping = page_mapping(page);
	pool = mapping->private_data;
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	if (get_zspage_inuse(zspage) == 0) {
		/* Empty zspage: nothing to migrate; async free will take it. */
		spin_unlock(&class->lock);
		return false;
	}

	/* zspage is isolated for object migration */
	if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
		spin_unlock(&class->lock);
		return false;
	}

	/*
	 * If this is first time isolation for the zspage, isolate zspage from
	 * size_class to prevent further object allocation from the zspage.
	 */
	if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
		/* Re-read fullness under class->lock; see comment above. */
		get_zspage_mapping(zspage, &class_idx, &fullness);
		remove_zspage(class, zspage, fullness);
	}

	inc_zspage_isolation(zspage);
	spin_unlock(&class->lock);

	return true;
}
2018
/*
 * VM migration hook: move one subpage of a zspage to @newpage.
 *
 * Sequence: take the zspage write lock (blocks zs_map_object readers and
 * concurrent subpage migration), pin every allocated object in the page
 * (failing with -EAGAIN if any is busy), copy the page contents, rewrite
 * every pinned handle to point at @newpage, then splice @newpage into the
 * zspage's page chain via replace_sub_page().
 *
 * Returns 0 on success, -EBUSY for an empty zspage, -EAGAIN when an
 * object could not be pinned.
 */
int zs_page_migrate(struct address_space *mapping, struct page *newpage,
		struct page *page, enum migrate_mode mode)
{
	struct zs_pool *pool;
	struct size_class *class;
	int class_idx;
	enum fullness_group fullness;
	struct zspage *zspage;
	struct page *dummy;
	void *s_addr, *d_addr, *addr;
	int offset, pos;
	unsigned long handle, head;
	unsigned long old_obj, new_obj;
	unsigned int obj_idx;
	int ret = -EAGAIN;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(!PageIsolated(page), page);

	zspage = get_zspage(page);

	/* Concurrent compactor cannot migrate any subpage in zspage */
	migrate_write_lock(zspage);
	get_zspage_mapping(zspage, &class_idx, &fullness);
	pool = mapping->private_data;
	class = pool->size_class[class_idx];
	offset = get_first_obj_offset(page);

	spin_lock(&class->lock);
	if (!get_zspage_inuse(zspage)) {
		ret = -EBUSY;
		goto unlock_class;
	}

	/* Pin every allocated object in this page; bail out if any is busy. */
	pos = offset;
	s_addr = kmap_atomic(page);
	while (pos < PAGE_SIZE) {
		head = obj_to_head(page, s_addr + pos);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (!trypin_tag(handle))
				goto unpin_objects;
		}
		pos += class->size;
	}

	/*
	 * Here, any user cannot access all objects in the zspage so let's move.
	 */
	d_addr = kmap_atomic(newpage);
	memcpy(d_addr, s_addr, PAGE_SIZE);
	kunmap_atomic(d_addr);

	/*
	 * Repoint each pinned handle at the same object index in @newpage,
	 * keeping HANDLE_PIN_BIT set so the unpin loop below releases it.
	 */
	for (addr = s_addr + offset; addr < s_addr + pos;
		addr += class->size) {
		head = obj_to_head(page, addr);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (!testpin_tag(handle))
				BUG();

			old_obj = handle_to_obj(handle);
			obj_to_location(old_obj, &dummy, &obj_idx);
			new_obj = (unsigned long)location_to_obj(newpage,
								obj_idx);
			new_obj |= BIT(HANDLE_PIN_BIT);
			record_obj(handle, new_obj);
		}
	}

	replace_sub_page(class, zspage, newpage, page);
	get_page(newpage);

	dec_zspage_isolation(zspage);

	/*
	 * Page migration is done so let's putback isolated zspage to
	 * the list if @page is final isolated subpage in the zspage.
	 */
	if (!is_zspage_isolated(zspage))
		putback_zspage(class, zspage);

	reset_page(page);
	put_page(page);
	page = newpage;

	ret = 0;
unpin_objects:
	/*
	 * NOTE(review): on the success path @page now aliases @newpage while
	 * @addr still walks the old page's atomic mapping (s_addr); confirm
	 * obj_to_head() tolerates this mix for the huge-object case.
	 */
	for (addr = s_addr + offset; addr < s_addr + pos;
						addr += class->size) {
		head = obj_to_head(page, addr);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (!testpin_tag(handle))
				BUG();
			unpin_tag(handle);
		}
	}
	kunmap_atomic(s_addr);
unlock_class:
	spin_unlock(&class->lock);
	migrate_write_unlock(zspage);

	return ret;
}
2124
/*
 * VM migration hook: undo zs_page_isolate() for a page whose migration
 * was aborted.  When this drops the last isolated subpage, the zspage is
 * reinserted into its fullness list; if it turned out empty, freeing is
 * deferred to pool->free_work because the caller still holds the page
 * lock (see async_free_zspage).
 */
void zs_page_putback(struct page *page)
{
	struct zs_pool *pool;
	struct size_class *class;
	int class_idx;
	enum fullness_group fg;
	struct address_space *mapping;
	struct zspage *zspage;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(!PageIsolated(page), page);

	zspage = get_zspage(page);
	get_zspage_mapping(zspage, &class_idx, &fg);
	mapping = page_mapping(page);
	pool = mapping->private_data;
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	dec_zspage_isolation(zspage);
	if (!is_zspage_isolated(zspage)) {
		fg = putback_zspage(class, zspage);
		/*
		 * Due to page_lock, we cannot free zspage immediately
		 * so let's defer.
		 */
		if (fg == ZS_EMPTY)
			schedule_work(&pool->free_work);
	}
	spin_unlock(&class->lock);
}
2156
/* Movable-page callbacks wired into each pool's inode mapping. */
const struct address_space_operations zsmalloc_aops = {
	.isolate_page = zs_page_isolate,
	.migratepage = zs_page_migrate,
	.putback_page = zs_page_putback,
};
2162
2163static int zs_register_migration(struct zs_pool *pool)
2164{
2165 pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
2166 if (IS_ERR(pool->inode)) {
2167 pool->inode = NULL;
2168 return 1;
2169 }
2170
2171 pool->inode->i_mapping->private_data = pool;
2172 pool->inode->i_mapping->a_ops = &zsmalloc_aops;
2173 return 0;
2174}
2175
/*
 * Tear down migration state at pool destruction: drain any pending
 * deferred-free work first (it dereferences the pool), then drop the
 * pool's backing inode if registration succeeded.
 */
static void zs_unregister_migration(struct zs_pool *pool)
{
	flush_work(&pool->free_work);
	if (pool->inode)
		iput(pool->inode);
}
2182
2183/*
2184 * Caller should hold page_lock of all pages in the zspage
2185 * In here, we cannot use zspage meta data.
2186 */
2187static void async_free_zspage(struct work_struct *work)
2188{
2189 int i;
2190 struct size_class *class;
2191 unsigned int class_idx;
2192 enum fullness_group fullness;
2193 struct zspage *zspage, *tmp;
2194 LIST_HEAD(free_pages);
2195 struct zs_pool *pool = container_of(work, struct zs_pool,
2196 free_work);
2197
2198 for (i = 0; i < zs_size_classes; i++) {
2199 class = pool->size_class[i];
2200 if (class->index != i)
2201 continue;
2202
2203 spin_lock(&class->lock);
2204 list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
2205 spin_unlock(&class->lock);
2206 }
2207
2208
2209 list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
2210 list_del(&zspage->list);
2211 lock_zspage(zspage);
2212
2213 get_zspage_mapping(zspage, &class_idx, &fullness);
2214 VM_BUG_ON(fullness != ZS_EMPTY);
2215 class = pool->size_class[class_idx];
2216 spin_lock(&class->lock);
2217 __free_zspage(pool, pool->size_class[class_idx], zspage);
2218 spin_unlock(&class->lock);
2219 }
2220};
2221
/* Queue the pool's deferred-free worker (async_free_zspage). */
static void kick_deferred_free(struct zs_pool *pool)
{
	schedule_work(&pool->free_work);
}
2226
/* One-time setup of the deferred-free work item, from zs_create_pool(). */
static void init_deferred_free(struct zs_pool *pool)
{
	INIT_WORK(&pool->free_work, async_free_zspage);
}
2231
2232static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
2233{
2234 struct page *page = get_first_page(zspage);
2235
2236 do {
2237 WARN_ON(!trylock_page(page));
2238 __SetPageMovable(page, pool->inode->i_mapping);
2239 unlock_page(page);
2240 } while ((page = get_next_page(page)) != NULL);
2241}
2242#endif
2243
1714/* 2244/*
1715 * 2245 *
1716 * Based on the number of unused allocated objects calculate 2246 * Based on the number of unused allocated objects calculate
@@ -1745,10 +2275,10 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
1745 break; 2275 break;
1746 2276
1747 cc.index = 0; 2277 cc.index = 0;
1748 cc.s_page = src_zspage->first_page; 2278 cc.s_page = get_first_page(src_zspage);
1749 2279
1750 while ((dst_zspage = isolate_zspage(class, false))) { 2280 while ((dst_zspage = isolate_zspage(class, false))) {
1751 cc.d_page = dst_zspage->first_page; 2281 cc.d_page = get_first_page(dst_zspage);
1752 /* 2282 /*
1753 * If there is no more space in dst_page, resched 2283 * If there is no more space in dst_page, resched
1754 * and see if anyone had allocated another zspage. 2284 * and see if anyone had allocated another zspage.
@@ -1765,11 +2295,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
1765 2295
1766 putback_zspage(class, dst_zspage); 2296 putback_zspage(class, dst_zspage);
1767 if (putback_zspage(class, src_zspage) == ZS_EMPTY) { 2297 if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
1768 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 2298 free_zspage(pool, class, src_zspage);
1769 class->size, class->pages_per_zspage));
1770 atomic_long_sub(class->pages_per_zspage,
1771 &pool->pages_allocated);
1772 free_zspage(pool, src_zspage);
1773 pool->stats.pages_compacted += class->pages_per_zspage; 2299 pool->stats.pages_compacted += class->pages_per_zspage;
1774 } 2300 }
1775 spin_unlock(&class->lock); 2301 spin_unlock(&class->lock);
@@ -1885,6 +2411,7 @@ struct zs_pool *zs_create_pool(const char *name)
1885 if (!pool) 2411 if (!pool)
1886 return NULL; 2412 return NULL;
1887 2413
2414 init_deferred_free(pool);
1888 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 2415 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1889 GFP_KERNEL); 2416 GFP_KERNEL);
1890 if (!pool->size_class) { 2417 if (!pool->size_class) {
@@ -1939,12 +2466,10 @@ struct zs_pool *zs_create_pool(const char *name)
1939 class->pages_per_zspage = pages_per_zspage; 2466 class->pages_per_zspage = pages_per_zspage;
1940 class->objs_per_zspage = class->pages_per_zspage * 2467 class->objs_per_zspage = class->pages_per_zspage *
1941 PAGE_SIZE / class->size; 2468 PAGE_SIZE / class->size;
1942 if (pages_per_zspage == 1 && class->objs_per_zspage == 1)
1943 class->huge = true;
1944 spin_lock_init(&class->lock); 2469 spin_lock_init(&class->lock);
1945 pool->size_class[i] = class; 2470 pool->size_class[i] = class;
1946 for (fullness = ZS_ALMOST_FULL; fullness <= ZS_ALMOST_EMPTY; 2471 for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
1947 fullness++) 2472 fullness++)
1948 INIT_LIST_HEAD(&class->fullness_list[fullness]); 2473 INIT_LIST_HEAD(&class->fullness_list[fullness]);
1949 2474
1950 prev_class = class; 2475 prev_class = class;
@@ -1953,6 +2478,9 @@ struct zs_pool *zs_create_pool(const char *name)
1953 /* debug only, don't abort if it fails */ 2478 /* debug only, don't abort if it fails */
1954 zs_pool_stat_create(pool, name); 2479 zs_pool_stat_create(pool, name);
1955 2480
2481 if (zs_register_migration(pool))
2482 goto err;
2483
1956 /* 2484 /*
1957 * Not critical, we still can use the pool 2485 * Not critical, we still can use the pool
1958 * and user can trigger compaction manually. 2486 * and user can trigger compaction manually.
@@ -1972,6 +2500,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1972 int i; 2500 int i;
1973 2501
1974 zs_unregister_shrinker(pool); 2502 zs_unregister_shrinker(pool);
2503 zs_unregister_migration(pool);
1975 zs_pool_stat_destroy(pool); 2504 zs_pool_stat_destroy(pool);
1976 2505
1977 for (i = 0; i < zs_size_classes; i++) { 2506 for (i = 0; i < zs_size_classes; i++) {
@@ -1984,7 +2513,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1984 if (class->index != i) 2513 if (class->index != i)
1985 continue; 2514 continue;
1986 2515
1987 for (fg = ZS_ALMOST_FULL; fg <= ZS_ALMOST_EMPTY; fg++) { 2516 for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
1988 if (!list_empty(&class->fullness_list[fg])) { 2517 if (!list_empty(&class->fullness_list[fg])) {
1989 pr_info("Freeing non-empty class with size %db, fullness group %d\n", 2518 pr_info("Freeing non-empty class with size %db, fullness group %d\n",
1990 class->size, fg); 2519 class->size, fg);
@@ -2002,7 +2531,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
2002 2531
2003static int __init zs_init(void) 2532static int __init zs_init(void)
2004{ 2533{
2005 int ret = zs_register_cpu_notifier(); 2534 int ret;
2535
2536 ret = zsmalloc_mount();
2537 if (ret)
2538 goto out;
2539
2540 ret = zs_register_cpu_notifier();
2006 2541
2007 if (ret) 2542 if (ret)
2008 goto notifier_fail; 2543 goto notifier_fail;
@@ -2019,7 +2554,8 @@ static int __init zs_init(void)
2019 2554
2020notifier_fail: 2555notifier_fail:
2021 zs_unregister_cpu_notifier(); 2556 zs_unregister_cpu_notifier();
2022 2557 zsmalloc_unmount();
2558out:
2023 return ret; 2559 return ret;
2024} 2560}
2025 2561
@@ -2028,6 +2564,7 @@ static void __exit zs_exit(void)
2028#ifdef CONFIG_ZPOOL 2564#ifdef CONFIG_ZPOOL
2029 zpool_unregister_driver(&zs_zpool_driver); 2565 zpool_unregister_driver(&zs_zpool_driver);
2030#endif 2566#endif
2567 zsmalloc_unmount();
2031 zs_unregister_cpu_notifier(); 2568 zs_unregister_cpu_notifier();
2032 2569
2033 zs_stat_exit(); 2570 zs_stat_exit();