aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author	Timofey Titovets <nefelim4ag@gmail.com>	2017-09-28 10:33:37 -0400
committer	David Sterba <dsterba@suse.com>	2017-11-01 15:45:36 -0400
commit17b5a6c17e265ca84fac9c3256ff86c691f04aab (patch)
tree2eb9c8dab73fa0f881a94c67ff7030adc4ec6630
parent4e439a0b184f014a5833fa468af36cc9f59b8fb1 (diff)
Btrfs: heuristic: add bucket and sample counters and other defines
Add basic defines and structures for data sampling.

Added macros:
 - For future sampling algo
 - For bucket size

Heuristic workspace:
 - Add bucket for storing byte type counters
 - Add sample array for storing partial copy of input data range
 - Add counter to store current sample size to workspace

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes, comments updated ]
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/compression.c53
1 file changed, 52 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b155542c5e6b..4f5d958c71ad 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -707,8 +707,47 @@ out:
707 return ret; 707 return ret;
708} 708}
709 709
710/*
711 * Heuristic uses systematic sampling to collect data from the input data
712 * range, the logic can be tuned by the following constants:
713 *
714 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
715 * @SAMPLING_INTERVAL - range from which the sampled data can be collected
716 */
717#define SAMPLING_READ_SIZE (16)
718#define SAMPLING_INTERVAL (256)
719
720/*
721 * For statistical analysis of the input data we consider bytes that form a
722 * Galois Field of 256 objects. Each object has an attribute count, ie. how
723 * many times the object appeared in the sample.
724 */
725#define BUCKET_SIZE (256)
726
727/*
728 * The size of the sample is based on a statistical sampling rule of thumb.
729 * The common way is to perform sampling tests as long as the number of
730 * elements in each cell is at least 5.
731 *
732 * Instead of 5, we choose 32 to obtain more accurate results.
733 * If the data contain the maximum number of symbols, which is 256, we obtain a
734 * sample size bound by 8192.
735 *
736 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
737 * from up to 512 locations.
738 */
739#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
740 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
741
742struct bucket_item {
743 u32 count;
744};
710 745
711struct heuristic_ws { 746struct heuristic_ws {
747 /* Partial copy of input data */
748 u8 *sample;
749 /* Buckets store counters for each byte value */
750 struct bucket_item *bucket;
712 struct list_head list; 751 struct list_head list;
713}; 752};
714 753
@@ -718,6 +757,8 @@ static void free_heuristic_ws(struct list_head *ws)
718 757
719 workspace = list_entry(ws, struct heuristic_ws, list); 758 workspace = list_entry(ws, struct heuristic_ws, list);
720 759
760 kvfree(workspace->sample);
761 kfree(workspace->bucket);
721 kfree(workspace); 762 kfree(workspace);
722} 763}
723 764
@@ -729,9 +770,19 @@ static struct list_head *alloc_heuristic_ws(void)
729 if (!ws) 770 if (!ws)
730 return ERR_PTR(-ENOMEM); 771 return ERR_PTR(-ENOMEM);
731 772
732 INIT_LIST_HEAD(&ws->list); 773 ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
774 if (!ws->sample)
775 goto fail;
776
777 ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
778 if (!ws->bucket)
779 goto fail;
733 780
781 INIT_LIST_HEAD(&ws->list);
734 return &ws->list; 782 return &ws->list;
783fail:
784 free_heuristic_ws(&ws->list);
785 return ERR_PTR(-ENOMEM);
735} 786}
736 787
737struct workspaces_list { 788struct workspaces_list {