UBIFS: fully sort GCed nodes

The 'joinup()' function cannot deal with situations when nodes go in reverse order - it just leaves them in this order. This patch implement full nodes sorting using n*log(n) algorithm. It sorts data nodes for bulk-read, and direntry nodes for readdir(). Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
author: Artem Bityutskiy <Artem.Bityutskiy@nokia.com> 2009-03-08 09:13:00 -0400
committer: Artem Bityutskiy <Artem.Bityutskiy@nokia.com> 2009-03-20 13:12:00 -0400
commit: f10770f5e56b4297701fd7c3e551b206f98d7ac2 (patch)
tree: 5fad34defa002857fca21dfde03e3813d68cfb4b /fs/ubifs
parent: 7d4e9ccb435e51e013e63abd340b4f496428139c (diff)
1 files changed, 296 insertions, 132 deletions
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3..f0f5f15d384 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
 * have to waste large pieces of free space at the end of LEB B, because nodes
 * from LEB A would not fit. And the worst situation is when all nodes are of
 * maximum size. So dark watermark is the amount of free + dirty space in LEB
- * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
 #include "ubifs.h"
 /*
- * GC tries to optimize the way it fit nodes to available space, and it sorts
- * nodes a little. The below constants are watermarks which define "large",
- * "medium", and "small" nodes.
- */
-#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
-#define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
-/*
 * GC may need to move more than one LEB to make progress. The below constants
 * define "soft" and "hard" limits on the number of LEBs the garbage collector
 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
 }
 /**
- * joinup - bring data nodes for an inode together.
+ * list_sort - sort a list.
- * @c: UBIFS file-system description object
+ * @priv: private data, passed to @cmp
- * @sleb: describes scanned LEB
+ * @head: the list to sort
- * @inum: inode number
+ * @cmp: the elements comparison function
- * @blk: block number
- * @data: list to which to add data nodes
 *
- * This function looks at the first few nodes in the scanned LEB @sleb and adds
+ * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
- * them to @data if they are data nodes from @inum and have a larger block
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
- * number than @blk. This function returns %0 on success and a negative error
+ * in ascending order.
- * code on failure.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * than @b, and a positive value if @a is greater than @b. If @a and @b are
+ * equivalent, then it does not matter what this function returns.
 */
-static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
+static void list_sort(void *priv, struct list_head *head,
-                  unsigned int blk, struct list_head *data)
+                      int (*cmp)(void *priv, struct list_head *a,
+                                 struct list_head *b))
 {
-        int err, cnt = 6, lnum = sleb->lnum, offs;
+        struct list_head *p, *q, *e, *list, *tail, *oldhead;
-        struct ubifs_scan_node *snod, *tmp;
+        int insize, nmerges, psize, qsize, i;
-        union ubifs_key *key;
+        if (list_empty(head))
+                return;
+        list = head->next;
+        list_del(head);
+        insize = 1;
+        for (;;) {
+                p = oldhead = list;
+                list = tail = NULL;
+                nmerges = 0;
+                while (p) {
+                        nmerges++;
+                        q = p;
+                        psize = 0;
+                        for (i = 0; i < insize; i++) {
+                                psize++;
+                                q = q->next == oldhead ? NULL : q->next;
+                                if (!q)
+                                        break;
+                        }
-        list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+                        qsize = insize;
-                key = &snod->key;
+                        while (psize > 0 || (qsize > 0 && q)) {
-                if (key_inum(c, key) == inum &&
+                                if (!psize) {
-                    key_type(c, key) == UBIFS_DATA_KEY &&
+                                        e = q;
-                    key_block(c, key) > blk) {
+                                        q = q->next;
-                        offs = snod->offs;
+                                        qsize--;
-                        err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
+                                        if (q == oldhead)
-                        if (err < 0)
+                                                q = NULL;
-                                return err;
+                                } else if (!qsize || !q) {
-                        list_del(&snod->list);
+                                        e = p;
-                        if (err) {
+                                        p = p->next;
-                                list_add_tail(&snod->list, data);
+                                        psize--;
-                                blk = key_block(c, key);
+                                        if (p == oldhead)
-                        } else
+                                                p = NULL;
-                                kfree(snod);
+                                } else if (cmp(priv, p, q) <= 0) {
-                        cnt = 6;
+                                        e = p;
-                } else if (--cnt == 0)
+                                        p = p->next;
+                                        psize--;
+                                        if (p == oldhead)
+                                                p = NULL;
+                                } else {
+                                        e = q;
+                                        q = q->next;
+                                        qsize--;
+                                        if (q == oldhead)
+                                                q = NULL;
+                                }
+                                if (tail)
+                                        tail->next = e;
+                                else
+                                        list = e;
+                                e->prev = tail;
+                                tail = e;
+                        }
+                        p = q;
+                }
+                tail->next = list;
+                list->prev = tail;
+                if (nmerges <= 1)
                        break;
+                insize *= 2;
        }
-        return 0;
+        head->next = list;
+        head->prev = list->prev;
+        list->prev->next = head;
+        list->prev = head;
 }
 /**
- * move_nodes - move nodes.
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @a: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+        ino_t inuma, inumb;
+        struct ubifs_info *c = priv;
+        struct ubifs_scan_node *sa, *sb;
+        cond_resched();
+        sa = list_entry(a, struct ubifs_scan_node, list);
+        sb = list_entry(b, struct ubifs_scan_node, list);
+        ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+        ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+        inuma = key_inum(c, &sa->key);
+        inumb = key_inum(c, &sb->key);
+        if (inuma == inumb) {
+                unsigned int blka = key_block(c, &sa->key);
+                unsigned int blkb = key_block(c, &sb->key);
+                if (blka <= blkb)
+                        return -1;
+        } else if (inuma <= inumb)
+                return -1;
+        return 1;
+}
+/*
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @a: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash valuer order.
+ */
+int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+        int typea, typeb;
+        ino_t inuma, inumb;
+        struct ubifs_info *c = priv;
+        struct ubifs_scan_node *sa, *sb;
+        cond_resched();
+        sa = list_entry(a, struct ubifs_scan_node, list);
+        sb = list_entry(b, struct ubifs_scan_node, list);
+        typea = key_type(c, &sa->key);
+        typeb = key_type(c, &sb->key);
+        ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+        /* Inodes go before directory entries */
+        if (typea == UBIFS_INO_KEY) {
+                if (typeb == UBIFS_INO_KEY)
+                        return sb->len - sa->len;
+                return -1;
+        }
+        if (typeb == UBIFS_INO_KEY)
+                return 1;
+        ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+        inuma = key_inum(c, &sa->key);
+        inumb = key_inum(c, &sb->key);
+        if (inuma == inumb) {
+                uint32_t hasha = key_hash(c, &sa->key);
+                uint32_t hashb = key_hash(c, &sb->key);
+                if (hasha <= hashb)
+                        return -1;
+        } else if (inuma <= inumb)
+                return -1;
+        return 1;
+}
+/**
+ * sort_nodes - sort nodes for GC.
 * @c: UBIFS file-system description object
- * @sleb: describes nodes to move
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
 *
- * This function moves valid nodes from data LEB described by @sleb to the GC
+ * This function sorts the list of inodes to garbage collect. First of all, it
- * journal head. The obsolete nodes are dropped.
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
 *
- * When moving nodes we have to deal with classical bin-packing problem: the
+ * Non-data nodes are sorted as follows.
- * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
+ *   o First go inode nodes - they are sorted in descending length order.
- * where the nodes in the @sleb->nodes list are the elements which should be
+ *   o Then go directory entry nodes - they are sorted in hash order, which
- * fit optimally to the bins. This function uses the "first fit decreasing"
+ *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
- * strategy, although it does not really sort the nodes but just split them on
+ *     inode number go before direntry nodes with higher parent inode number,
- * 3 classes - large, medium, and small, so they are roughly sorted.
+ *     and direntry nodes with lower name hash values go before direntry nodes
+ *     with higher name hash values.
 *
- * This function returns zero in case of success, %-EAGAIN if commit is
+ * This function returns zero in case of success and a negative error code in
- * required, and other negative error codes in case of other failures.
+ * case of failure.
 */
-static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+                      struct list_head *nondata, int *min)
 {
        struct ubifs_scan_node *snod, *tmp;
-        struct list_head data, large, medium, small;
-        struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
-        int avail, err, min = INT_MAX;
-        unsigned int blk = 0;
-        ino_t inum = 0;
-        INIT_LIST_HEAD(&data);
+        *min = INT_MAX;
-        INIT_LIST_HEAD(&large);
-        INIT_LIST_HEAD(&medium);
-        INIT_LIST_HEAD(&small);
-        while (!list_empty(&sleb->nodes)) {
+        /* Separate data nodes and non-data nodes */
-                struct list_head *lst = sleb->nodes.next;
+        list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+                int err;
-                snod = list_entry(lst, struct ubifs_scan_node, list);
                ubifs_assert(snod->type != UBIFS_IDX_NODE);
                ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
                err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
                                         snod->offs, 0);
                if (err < 0)
-                        goto out;
+                        return err;
-                list_del(lst);
                if (!err) {
                        /* The node is obsolete, remove it from the list */
+                        list_del(&snod->list);
                        kfree(snod);
                        continue;
                }
-                /*
+                if (snod->len < *min)
-                 * Sort the list of nodes so that data nodes go first, large
+                        *min = snod->len;
-                 * nodes go second, and small nodes go last.
-                 */
+                if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
-                if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
+                        list_move_tail(&snod->list, nondata);
-                        if (inum != key_inum(c, &snod->key)) {
-                                if (inum) {
-                                        /*
-                                         * Try to move data nodes from the same
-                                         * inode together.
-                                         */
-                                        err = joinup(c, sleb, inum, blk, &data);
-                                        if (err)
-                                                goto out;
-                                }
-                                inum = key_inum(c, &snod->key);
-                                blk = key_block(c, &snod->key);
-                        }
-                        list_add_tail(lst, &data);
-                } else if (snod->len > MEDIUM_NODE_WM)
-                        list_add_tail(lst, &large);
-                else if (snod->len > SMALL_NODE_WM)
-                        list_add_tail(lst, &medium);
-                else
-                        list_add_tail(lst, &small);
-                /* And find the smallest node */
-                if (snod->len < min)
-                        min = snod->len;
        }
-        /*
+        /* Sort data and non-data nodes */
-         * Join the tree lists so that we'd have one roughly sorted list
+        list_sort(c, &sleb->nodes, &data_nodes_cmp);
-         * ('large' will be the head of the joined list).
+        list_sort(c, nondata, &nondata_nodes_cmp);
-         */
+        return 0;
-        list_splice(&data, &large);
+}
-        list_splice(&medium, large.prev);
-        list_splice(&small, large.prev);
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the mode to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+                     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+        int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+        cond_resched();
+        err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
+        if (err)
+                return err;
+        err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+                                snod->offs, new_lnum, new_offs,
+                                snod->len);
+        list_del(&snod->list);
+        kfree(snod);
+        return err;
+}
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+        int err, min;
+        LIST_HEAD(nondata);
+        struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
        if (wbuf->lnum == -1) {
                /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
                 */
                err = switch_gc_head(c);
                if (err)
-                        goto out;
+                        return err;
        }
+        err = sort_nodes(c, sleb, &nondata, &min);
+        if (err)
+                goto out;
        /* Write nodes to their new location. Use the first-fit strategy */
        while (1) {
-                avail = c->leb_size - wbuf->offs - wbuf->used;
+                int avail;
-                list_for_each_entry_safe(snod, tmp, &large, list) {
+                struct ubifs_scan_node *snod, *tmp;
-                        int new_lnum, new_offs;
+                /* Move data nodes */
+                list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+                        avail = c->leb_size - wbuf->offs - wbuf->used;
+                        if  (snod->len > avail)
+                                /*
+                                 * Do not skip data nodes in order to optimize
+                                 * bulk-read.
+                                 */
+                                break;
+                        err = move_node(c, sleb, snod, wbuf);
+                        if (err)
+                                goto out;
+                }
+                /* Move non-data nodes */
+                list_for_each_entry_safe(snod, tmp, &nondata, list) {
+                        avail = c->leb_size - wbuf->offs - wbuf->used;
                        if (avail < min)
                                break;
-                        if (snod->len > avail)
+                        if  (snod->len > avail) {
-                                /* This node does not fit */
+                                /*
+                                 * Keep going only if this is an inode with
+                                 * some data. Otherwise stop and switch the GC
+                                 * head. IOW, we assume that data-less inode
+                                 * nodes and direntry nodes are roughly of the
+                                 * same size.
+                                 */
+                                if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+                                    snod->len == UBIFS_INO_NODE_SZ)
+                                        break;
                                continue;
+                        }
-                        cond_resched();
+                        err = move_node(c, sleb, snod, wbuf);
-                        new_lnum = wbuf->lnum;
-                        new_offs = wbuf->offs + wbuf->used;
-                        err = ubifs_wbuf_write_nolock(wbuf, snod->node,
-                                                      snod->len);
                        if (err)
                                goto out;
-                        err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
-                                                snod->offs, new_lnum, new_offs,
-                                                snod->len);
-                        if (err)
-                                goto out;
-                        avail = c->leb_size - wbuf->offs - wbuf->used;
-                        list_del(&snod->list);
-                        kfree(snod);
                }
-                if (list_empty(&large))
+                if (list_empty(&sleb->nodes) && list_empty(&nondata))
                        break;
                /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
        return 0;
 out:
-        list_for_each_entry_safe(snod, tmp, &large, list) {
+        list_splice_tail(&nondata, &sleb->nodes);
-                list_del(&snod->list);
-                kfree(snod);
-        }
        return err;
 }
author	Artem Bityutskiy <Artem.Bityutskiy@nokia.com>	2009-03-08 09:13:00 -0400
committer	Artem Bityutskiy <Artem.Bityutskiy@nokia.com>	2009-03-20 13:12:00 -0400
commit	f10770f5e56b4297701fd7c3e551b206f98d7ac2 (patch)
tree	5fad34defa002857fca21dfde03e3813d68cfb4b /fs/ubifs
parent	7d4e9ccb435e51e013e63abd340b4f496428139c (diff)

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a711d33b3d3..f0f5f15d384 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47	* have to waste large pieces of free space at the end of LEB B, because nodes	47	* have to waste large pieces of free space at the end of LEB B, because nodes
48	* from LEB A would not fit. And the worst situation is when all nodes are of	48	* from LEB A would not fit. And the worst situation is when all nodes are of
49	* maximum size. So dark watermark is the amount of free + dirty space in LEB	49	* maximum size. So dark watermark is the amount of free + dirty space in LEB
50	* which are guaranteed to be reclaimable. If LEB has less space, the GC migh	50	* which are guaranteed to be reclaimable. If LEB has less space, the GC might
51	* be unable to reclaim it. So, LEBs with free + dirty greater than dark	51	* be unable to reclaim it. So, LEBs with free + dirty greater than dark
52	* watermark are "good" LEBs from GC's point of few. The other LEBs are not so	52	* watermark are "good" LEBs from GC's point of few. The other LEBs are not so
53	* good, and GC takes extra care when moving them.	53	* good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57	#include "ubifs.h"	57	#include "ubifs.h"
58		58
59	/*	59	/*
60	* GC tries to optimize the way it fit nodes to available space, and it sorts
61	* nodes a little. The below constants are watermarks which define "large",
62	* "medium", and "small" nodes.
63	*/
64	#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65	#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67	/*
68	* GC may need to move more than one LEB to make progress. The below constants	60	* GC may need to move more than one LEB to make progress. The below constants
69	* define "soft" and "hard" limits on the number of LEBs the garbage collector	61	* define "soft" and "hard" limits on the number of LEBs the garbage collector
70	* may move.	62	* may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116	}	108	}
117		109
118	/**	110	/**
119	* joinup - bring data nodes for an inode together.	111	* list_sort - sort a list.
120	* @c: UBIFS file-system description object	112	* @priv: private data, passed to @cmp
121	* @sleb: describes scanned LEB	113	* @head: the list to sort
122	* @inum: inode number	114	* @cmp: the elements comparison function
123	* @blk: block number
124	* @data: list to which to add data nodes
125	*	115	*
126	* This function looks at the first few nodes in the scanned LEB @sleb and adds	116	* This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127	* them to @data if they are data nodes from @inum and have a larger block	117	* implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128	* number than @blk. This function returns %0 on success and a negative error	118	* in ascending order.
129	* code on failure.	119	*
		120	* The comparison function @cmp is supposed to return a negative value if @a is
		121	* than @b, and a positive value if @a is greater than @b. If @a and @b are
		122	* equivalent, then it does not matter what this function returns.
130	*/	123	*/
131	static int joinup(struct ubifs_info c, struct ubifs_scan_leb sleb, ino_t inum,	124	static void list_sort(void priv, struct list_head head,
132	unsigned int blk, struct list_head *data)	125	int (cmp)(void priv, struct list_head *a,
		126	struct list_head *b))
133	{	127	{
134	int err, cnt = 6, lnum = sleb->lnum, offs;	128	struct list_head p, q, e, list, tail, oldhead;
135	struct ubifs_scan_node snod, tmp;	129	int insize, nmerges, psize, qsize, i;
136	union ubifs_key *key;	130
		131	if (list_empty(head))
		132	return;
		133
		134	list = head->next;
		135	list_del(head);
		136	insize = 1;
		137	for (;;) {
		138	p = oldhead = list;
		139	list = tail = NULL;
		140	nmerges = 0;
		141
		142	while (p) {
		143	nmerges++;
		144	q = p;
		145	psize = 0;
		146	for (i = 0; i < insize; i++) {
		147	psize++;
		148	q = q->next == oldhead ? NULL : q->next;
		149	if (!q)
		150	break;
		151	}
137		152
138	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {	153	qsize = insize;
139	key = &snod->key;	154	while (psize > 0 \|\| (qsize > 0 && q)) {
140	if (key_inum(c, key) == inum &&	155	if (!psize) {
141	key_type(c, key) == UBIFS_DATA_KEY &&	156	e = q;
142	key_block(c, key) > blk) {	157	q = q->next;
143	offs = snod->offs;	158	qsize--;
144	err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);	159	if (q == oldhead)
145	if (err < 0)	160	q = NULL;
146	return err;	161	} else if (!qsize \|\| !q) {
147	list_del(&snod->list);	162	e = p;
148	if (err) {	163	p = p->next;
149	list_add_tail(&snod->list, data);	164	psize--;
150	blk = key_block(c, key);	165	if (p == oldhead)
151	} else	166	p = NULL;
152	kfree(snod);	167	} else if (cmp(priv, p, q) <= 0) {
153	cnt = 6;	168	e = p;
154	} else if (--cnt == 0)	169	p = p->next;
		170	psize--;
		171	if (p == oldhead)
		172	p = NULL;
		173	} else {
		174	e = q;
		175	q = q->next;
		176	qsize--;
		177	if (q == oldhead)
		178	q = NULL;
		179	}
		180	if (tail)
		181	tail->next = e;
		182	else
		183	list = e;
		184	e->prev = tail;
		185	tail = e;
		186	}
		187	p = q;
		188	}
		189
		190	tail->next = list;
		191	list->prev = tail;
		192
		193	if (nmerges <= 1)
155	break;	194	break;
		195
		196	insize *= 2;
156	}	197	}
157	return 0;	198
		199	head->next = list;
		200	head->prev = list->prev;
		201	list->prev->next = head;
		202	list->prev = head;
158	}	203	}
159		204
160	/**	205	/**
161	* move_nodes - move nodes.	206	* data_nodes_cmp - compare 2 data nodes.
		207	* @priv: UBIFS file-system description object
		208	* @a: first data node
		209	* @a: second data node
		210	*
		211	* This function compares data nodes @a and @b. Returns %1 if @a has greater
		212	* inode or block number, and %-1 otherwise.
		213	*/
		214	int data_nodes_cmp(void priv, struct list_head a, struct list_head *b)
		215	{
		216	ino_t inuma, inumb;
		217	struct ubifs_info *c = priv;
		218	struct ubifs_scan_node sa, sb;
		219
		220	cond_resched();
		221	sa = list_entry(a, struct ubifs_scan_node, list);
		222	sb = list_entry(b, struct ubifs_scan_node, list);
		223	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
		224	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
		225
		226	inuma = key_inum(c, &sa->key);
		227	inumb = key_inum(c, &sb->key);
		228
		229	if (inuma == inumb) {
		230	unsigned int blka = key_block(c, &sa->key);
		231	unsigned int blkb = key_block(c, &sb->key);
		232
		233	if (blka <= blkb)
		234	return -1;
		235	} else if (inuma <= inumb)
		236	return -1;
		237
		238	return 1;
		239	}
		240
		241	/*
		242	* nondata_nodes_cmp - compare 2 non-data nodes.
		243	* @priv: UBIFS file-system description object
		244	* @a: first node
		245	* @a: second node
		246	*
		247	* This function compares nodes @a and @b. It makes sure that inode nodes go
		248	* first and sorted by length in descending order. Directory entry nodes go
		249	* after inode nodes and are sorted in ascending hash valuer order.
		250	*/
		251	int nondata_nodes_cmp(void priv, struct list_head a, struct list_head *b)
		252	{
		253	int typea, typeb;
		254	ino_t inuma, inumb;
		255	struct ubifs_info *c = priv;
		256	struct ubifs_scan_node sa, sb;
		257
		258	cond_resched();
		259	sa = list_entry(a, struct ubifs_scan_node, list);
		260	sb = list_entry(b, struct ubifs_scan_node, list);
		261	typea = key_type(c, &sa->key);
		262	typeb = key_type(c, &sb->key);
		263	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
		264
		265	/* Inodes go before directory entries */
		266	if (typea == UBIFS_INO_KEY) {
		267	if (typeb == UBIFS_INO_KEY)
		268	return sb->len - sa->len;
		269	return -1;
		270	}
		271	if (typeb == UBIFS_INO_KEY)
		272	return 1;
		273
		274	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
		275	inuma = key_inum(c, &sa->key);
		276	inumb = key_inum(c, &sb->key);
		277
		278	if (inuma == inumb) {
		279	uint32_t hasha = key_hash(c, &sa->key);
		280	uint32_t hashb = key_hash(c, &sb->key);
		281
		282	if (hasha <= hashb)
		283	return -1;
		284	} else if (inuma <= inumb)
		285	return -1;
		286
		287	return 1;
		288	}
		289
		290	/**
		291	* sort_nodes - sort nodes for GC.
162	* @c: UBIFS file-system description object	292	* @c: UBIFS file-system description object
163	* @sleb: describes nodes to move	293	* @sleb: describes nodes to sort and contains the result on exit
		294	* @nondata: contains non-data nodes on exit
		295	* @min: minimum node size is returned here
164	*	296	*
165	* This function moves valid nodes from data LEB described by @sleb to the GC	297	* This function sorts the list of inodes to garbage collect. First of all, it
166	* journal head. The obsolete nodes are dropped.	298	* kills obsolete nodes and separates data and non-data nodes to the
		299	* @sleb->nodes and @nondata lists correspondingly.
		300	*
		301	* Data nodes are then sorted in block number order - this is important for
		302	* bulk-read; data nodes with lower inode number go before data nodes with
		303	* higher inode number, and data nodes with lower block number go before data
		304	* nodes with higher block number;
167	*	305	*
168	* When moving nodes we have to deal with classical bin-packing problem: the	306	* Non-data nodes are sorted as follows.
169	* space in the current GC journal head LEB and in @c->gc_lnum are the "bins",	307	* o First go inode nodes - they are sorted in descending length order.
170	* where the nodes in the @sleb->nodes list are the elements which should be	308	* o Then go directory entry nodes - they are sorted in hash order, which
171	* fit optimally to the bins. This function uses the "first fit decreasing"	309	* should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172	* strategy, although it does not really sort the nodes but just split them on	310	* inode number go before direntry nodes with higher parent inode number,
173	* 3 classes - large, medium, and small, so they are roughly sorted.	311	* and direntry nodes with lower name hash values go before direntry nodes
		312	* with higher name hash values.
174	*	313	*
175	* This function returns zero in case of success, %-EAGAIN if commit is	314	* This function returns zero in case of success and a negative error code in
176	* required, and other negative error codes in case of other failures.	315	* case of failure.
177	*/	316	*/
178	static int move_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb)	317	static int sort_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb,
		318	struct list_head nondata, int min)
179	{	319	{
180	struct ubifs_scan_node snod, tmp;	320	struct ubifs_scan_node snod, tmp;
181	struct list_head data, large, medium, small;
182	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183	int avail, err, min = INT_MAX;
184	unsigned int blk = 0;
185	ino_t inum = 0;
186		321
187	INIT_LIST_HEAD(&data);	322	*min = INT_MAX;
188	INIT_LIST_HEAD(&large);
189	INIT_LIST_HEAD(&medium);
190	INIT_LIST_HEAD(&small);
191		323
192	while (!list_empty(&sleb->nodes)) {	324	/* Separate data nodes and non-data nodes */
193	struct list_head *lst = sleb->nodes.next;	325	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194		326	int err;
195	snod = list_entry(lst, struct ubifs_scan_node, list);
196		327
197	ubifs_assert(snod->type != UBIFS_IDX_NODE);	328	ubifs_assert(snod->type != UBIFS_IDX_NODE);
198	ubifs_assert(snod->type != UBIFS_REF_NODE);	329	ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb)
201	err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,	332	err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202	snod->offs, 0);	333	snod->offs, 0);
203	if (err < 0)	334	if (err < 0)
204	goto out;	335	return err;
205		336
206	list_del(lst);
207	if (!err) {	337	if (!err) {
208	/* The node is obsolete, remove it from the list */	338	/* The node is obsolete, remove it from the list */
		339	list_del(&snod->list);
209	kfree(snod);	340	kfree(snod);
210	continue;	341	continue;
211	}	342	}
212		343
213	/*	344	if (snod->len < *min)
214	* Sort the list of nodes so that data nodes go first, large	345	*min = snod->len;
215	* nodes go second, and small nodes go last.	346
216	*/	347	if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217	if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {	348	list_move_tail(&snod->list, nondata);
218	if (inum != key_inum(c, &snod->key)) {
219	if (inum) {
220	/*
221	* Try to move data nodes from the same
222	* inode together.
223	*/
224	err = joinup(c, sleb, inum, blk, &data);
225	if (err)
226	goto out;
227	}
228	inum = key_inum(c, &snod->key);
229	blk = key_block(c, &snod->key);
230	}
231	list_add_tail(lst, &data);
232	} else if (snod->len > MEDIUM_NODE_WM)
233	list_add_tail(lst, &large);
234	else if (snod->len > SMALL_NODE_WM)
235	list_add_tail(lst, &medium);
236	else
237	list_add_tail(lst, &small);
238
239	/* And find the smallest node */
240	if (snod->len < min)
241	min = snod->len;
242	}	349	}
243		350
244	/*	351	/* Sort data and non-data nodes */
245	* Join the tree lists so that we'd have one roughly sorted list	352	list_sort(c, &sleb->nodes, &data_nodes_cmp);
246	* ('large' will be the head of the joined list).	353	list_sort(c, nondata, &nondata_nodes_cmp);
247	*/	354	return 0;
248	list_splice(&data, &large);	355	}
249	list_splice(&medium, large.prev);	356
250	list_splice(&small, large.prev);	357	/**
		358	* move_node - move a node.
		359	* @c: UBIFS file-system description object
		360	* @sleb: describes the LEB to move nodes from
		361	* @snod: the mode to move
		362	* @wbuf: write-buffer to move node to
		363	*
		364	* This function moves node @snod to @wbuf, changes TNC correspondingly, and
		365	* destroys @snod. Returns zero in case of success and a negative error code in
		366	* case of failure.
		367	*/
		368	static int move_node(struct ubifs_info c, struct ubifs_scan_leb sleb,
		369	struct ubifs_scan_node snod, struct ubifs_wbuf wbuf)
		370	{
		371	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
		372
		373	cond_resched();
		374	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
		375	if (err)
		376	return err;
		377
		378	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
		379	snod->offs, new_lnum, new_offs,
		380	snod->len);
		381	list_del(&snod->list);
		382	kfree(snod);
		383	return err;
		384	}
		385
		386	/**
		387	* move_nodes - move nodes.
		388	* @c: UBIFS file-system description object
		389	* @sleb: describes the LEB to move nodes from
		390	*
		391	* This function moves valid nodes from data LEB described by @sleb to the GC
		392	* journal head. This function returns zero in case of success, %-EAGAIN if
		393	* commit is required, and other negative error codes in case of other
		394	* failures.
		395	*/
		396	static int move_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb)
		397	{
		398	int err, min;
		399	LIST_HEAD(nondata);
		400	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251		401
252	if (wbuf->lnum == -1) {	402	if (wbuf->lnum == -1) {
253	/*	403	/*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb)
256	*/	406	*/
257	err = switch_gc_head(c);	407	err = switch_gc_head(c);
258	if (err)	408	if (err)
259	goto out;	409	return err;
260	}	410	}
261		411
		412	err = sort_nodes(c, sleb, &nondata, &min);
		413	if (err)
		414	goto out;
		415
262	/* Write nodes to their new location. Use the first-fit strategy */	416	/* Write nodes to their new location. Use the first-fit strategy */
263	while (1) {	417	while (1) {
264	avail = c->leb_size - wbuf->offs - wbuf->used;	418	int avail;
265	list_for_each_entry_safe(snod, tmp, &large, list) {	419	struct ubifs_scan_node snod, tmp;
266	int new_lnum, new_offs;	420
		421	/* Move data nodes */
		422	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
		423	avail = c->leb_size - wbuf->offs - wbuf->used;
		424	if (snod->len > avail)
		425	/*
		426	* Do not skip data nodes in order to optimize
		427	* bulk-read.
		428	*/
		429	break;
		430
		431	err = move_node(c, sleb, snod, wbuf);
		432	if (err)
		433	goto out;
		434	}
267		435
		436	/* Move non-data nodes */
		437	list_for_each_entry_safe(snod, tmp, &nondata, list) {
		438	avail = c->leb_size - wbuf->offs - wbuf->used;
268	if (avail < min)	439	if (avail < min)
269	break;	440	break;
270		441
271	if (snod->len > avail)	442	if (snod->len > avail) {
272	/* This node does not fit */	443	/*
		444	* Keep going only if this is an inode with
		445	* some data. Otherwise stop and switch the GC
		446	* head. IOW, we assume that data-less inode
		447	* nodes and direntry nodes are roughly of the
		448	* same size.
		449	*/
		450	if (key_type(c, &snod->key) == UBIFS_DENT_KEY \|\|
		451	snod->len == UBIFS_INO_NODE_SZ)
		452	break;
273	continue;	453	continue;
		454	}
274		455
275	cond_resched();	456	err = move_node(c, sleb, snod, wbuf);
276
277	new_lnum = wbuf->lnum;
278	new_offs = wbuf->offs + wbuf->used;
279	err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280	snod->len);
281	if (err)	457	if (err)
282	goto out;	458	goto out;
283	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284	snod->offs, new_lnum, new_offs,
285	snod->len);
286	if (err)
287	goto out;
288
289	avail = c->leb_size - wbuf->offs - wbuf->used;
290	list_del(&snod->list);
291	kfree(snod);
292	}	459	}
293		460
294	if (list_empty(&large))	461	if (list_empty(&sleb->nodes) && list_empty(&nondata))
295	break;	462	break;
296		463
297	/*	464	/*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb)
306	return 0;	473	return 0;
307		474
308	out:	475	out:
309	list_for_each_entry_safe(snod, tmp, &large, list) {	476	list_splice_tail(&nondata, &sleb->nodes);
310	list_del(&snod->list);
311	kfree(snod);
312	}
313	return err;	477	return err;
314	}	478	}
315		479