aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author	Timofey Titovets <nefelim4ag@gmail.com>	2017-09-28 10:33:37 -0400
committer	David Sterba <dsterba@suse.com>	2017-11-01 15:45:36 -0400
commit17b5a6c17e265ca84fac9c3256ff86c691f04aab (patch)
tree2eb9c8dab73fa0f881a94c67ff7030adc4ec6630
parent4e439a0b184f014a5833fa468af36cc9f59b8fb1 (diff)
Btrfs: heuristic: add bucket and sample counters and other defines
Add basic defines and structures for data sampling.

Added macros:
 - For future sampling algo
 - For bucket size

Heuristic workspace:
 - Add bucket for storing byte type counters
 - Add sample array for storing partial copy of input data range
 - Add counter to store current sample size to workspace

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes, comments updated ]
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/compression.c53
1 file changed, 52 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b155542c5e6b..4f5d958c71ad 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -707,8 +707,47 @@ out:
707 return ret; 707 return ret;
708} 708}
709 709
710/*
711 * Heuristic uses systematic sampling to collect data from the input data
712 * range, the logic can be tuned by the following constants:
713 *
714 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
715 * @SAMPLING_INTERVAL - range from which the sampled data can be collected
716 */
717#define SAMPLING_READ_SIZE (16)
718#define SAMPLING_INTERVAL (256)
719
720/*
721 * For statistical analysis of the input data we consider bytes that form a
722 * Galois Field of 256 objects. Each object has an attribute count, ie. how
723 * many times the object appeared in the sample.
724 */
725#define BUCKET_SIZE (256)
726
727/*
728 * The size of the sample is based on a statistical sampling rule of thumb.
729 * The common way is to perform sampling tests as long as the number of
730 * elements in each cell is at least 5.
731 *
732 * Instead of 5, we choose 32 to obtain more accurate results.
733 * If the data contain the maximum number of symbols, which is 256, we obtain a
734 * sample size bound by 8192.
735 *
736 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
737 * from up to 512 locations.
738 */
739#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
740 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
741
742struct bucket_item {
743 u32 count;
744};
710 745
711struct heuristic_ws { 746struct heuristic_ws {
747 /* Partial copy of input data */
748 u8 *sample;
749 /* Buckets store counters for each byte value */
750 struct bucket_item *bucket;
712 struct list_head list; 751 struct list_head list;
713}; 752};
714 753
@@ -718,6 +757,8 @@ static void free_heuristic_ws(struct list_head *ws)
718 757
719 workspace = list_entry(ws, struct heuristic_ws, list); 758 workspace = list_entry(ws, struct heuristic_ws, list);
720 759
760 kvfree(workspace->sample);
761 kfree(workspace->bucket);
721 kfree(workspace); 762 kfree(workspace);
722} 763}
723 764
@@ -729,9 +770,19 @@ static struct list_head *alloc_heuristic_ws(void)
729 if (!ws) 770 if (!ws)
730 return ERR_PTR(-ENOMEM); 771 return ERR_PTR(-ENOMEM);
731 772
732 INIT_LIST_HEAD(&ws->list); 773 ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
774 if (!ws->sample)
775 goto fail;
776
777 ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
778 if (!ws->bucket)
779 goto fail;
733 780
781 INIT_LIST_HEAD(&ws->list);
734 return &ws->list; 782 return &ws->list;
783fail:
784 free_heuristic_ws(&ws->list);
785 return ERR_PTR(-ENOMEM);
735} 786}
736 787
737struct workspaces_list { 788struct workspaces_list {