diff options
| author | Björn B. Brandenburg <bbb@cs.unc.edu> | 2010-10-12 17:52:30 -0400 |
|---|---|---|
| committer | Björn B. Brandenburg <bbb@cs.unc.edu> | 2010-10-12 17:52:51 -0400 |
| commit | 3da42149996ac964b2f6fd6a6673e1b59522a5c1 (patch) | |
| tree | c8f1867a8bfc09c781f64e5313cfa72fc30eed4b | |
| parent | 55d915be53a20c416d8d83c55889c29e9c788a7c (diff) | |
add tool for merging multiple data files into one
| -rwxr-xr-x | merge.py | 150 |
1 file changed, 150 insertions, 0 deletions
diff --git a/merge.py b/merge.py new file mode 100755 index 0000000..153c0c2 --- /dev/null +++ b/merge.py | |||
| @@ -0,0 +1,150 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | import defapp | ||
| 4 | from optparse import make_option as o | ||
| 5 | from os.path import splitext, basename, split | ||
| 6 | |||
| 7 | import os | ||
| 8 | |||
| 9 | import sys | ||
| 10 | |||
| 11 | from util import load_csv_file | ||
| 12 | |||
| 13 | from wsched import split_rows, group, filter_comments, print_row | ||
| 14 | |||
| 15 | |||
def data_from(fname, keycol):
    """Load the CSV file `fname` and group its rows by column `keycol`.

    Returns whatever `group` produces for the loaded data; the callers in
    this file iterate over the result as (key, rows) pairs.
    """
    return group(load_csv_file(fname), keycol)
| 19 | |||
def merged_data(fnames, keycol, replace=True):
    """Yield the rows of several data files merged by key.

    fnames  -- iterable of file names, processed in the given order
    keycol  -- index of the column used as the grouping key
    replace -- if true (the default), the rows a later file provides for a
               key replace the rows collected from earlier files; if false,
               they are appended instead

    A file that raises IOError is skipped after a message is written to
    stderr. Rows are yielded grouped by key, in sorted key order.
    """
    all_data = {}
    for fname in fnames:
        try:
            for (key, rows) in data_from(fname, keycol):
                if key in all_data and not replace:
                    all_data[key].extend(rows)
                else:
                    all_data[key] = rows
        except IOError as ioe:
            # Same reporting style as DataFileMerger.merge()/sub(): write to
            # stderr directly instead of relying on the print chevron syntax.
            sys.stderr.write("[!!] Skipping %s; %s.\n" % (fname, str(ioe)))
    for key in sorted(all_data):
        for row in all_data[key]:
            yield row
| 34 | |||
def substituted_data(origname, files, keycol):
    """Yield the rows of `origname` with selected columns taken from other files.

    origname -- name of the base data file
    files    -- iterable of (column index, file name) pairs; for each pair,
                the given column of the base rows is overwritten with the
                value found in the same column of that file
    keycol   -- index of the column used to match rows between files

    Only keys that exist in the base file are touched. Rows sharing a key
    are paired positionally with zip(), so surplus rows on either side are
    left alone. A substitute file that raises IOError is skipped after a
    message is written to stderr. Rows are yielded in the key order returned
    by split_rows() for the base file.
    """
    data = load_csv_file(origname)
    (by_key, order) = split_rows(data, keycol)
    for (idx, fname) in files:
        try:
            for (key, rows) in data_from(fname, keycol):
                if key in by_key:
                    for orow, nrow in zip(by_key[key], rows):
                        # substitute
                        orow[idx] = nrow[idx]
        except IOError as ioe:
            # Same reporting style as DataFileMerger.merge()/sub(): write to
            # stderr directly instead of relying on the print chevron syntax.
            sys.stderr.write("[!!] Skipping %s; %s.\n" % (fname, str(ioe)))
    for key in order:
        for row in by_key[key]:
            yield row
| 50 | |||
| 51 | |||
# Command-line options handed to defapp.App by DataFileMerger.
options = [
    # output options
    o("-o", "--output-prefix", dest="prefix", action="store"),
    o("-k", "--key-column", dest="keycol", action="store", type="int"),
    o(None, "--no-comments", dest="no_comments", action="store_true"),
    o("-w", "--col-width", dest="colwidth", action="store", type="int"),
    o("-d", "--dir", dest="merge_dirs", action="append"),
    o("-s", "--sub", dest="sub_col", action="append", type="int"),
]

# Default option values; 'precision' and 'indent' have no matching
# command-line switch in the list above.
defaults = dict(
    prefix=None,
    keycol=0,
    no_comments=False,
    colwidth=10,
    precision=3,
    indent=2,
    merge_dirs=[],
    sub_col=[],
)
| 72 | |||
| 73 | |||
class DataFileMerger(defapp.App):
    """Merge same-named data files found in several directories.

    For every data file named on the command line, files with the same base
    name are looked up in the file's own directory and in each directory
    given with -d/--dir. The result is written to a file whose name is the
    output prefix (-o) followed by the base name.

    Two modes exist, chosen in default():
      * row merge (no -s given): rows from all files are merged by key;
      * column substitution (-s given): the first file is the base, and the
        n-th -s column is taken from the n-th additional directory's file.
    """

    def __init__(self):
        defapp.App.__init__(self, options, defaults, no_std_opts=True)
        self.out = sys.stdout

    def _copy_comments(self, files, out):
        """Pass each source file through filter_comments() unless --no-comments."""
        if not self.options.no_comments:
            for src in files:
                filter_comments(src, out=out)

    def _process_targets(self, verb, handler):
        """Run `handler(bname, dirs, fout)` for every data file in self.args.

        `verb` is the word used in the progress line ("Merging" or
        "Column-merging"). An IOError raised by the handler is reported on
        stderr and processing continues with the next data file.
        """
        if not self.options.prefix:
            print("Requires output prefix (-o).")
            return
        total = len(self.args)
        for i, datafile in enumerate(self.args):
            (fout, bname, dirs) = self.target_name(datafile)
            print("[%d/%d] %s {%s}/%s -> %s" %
                  (i + 1, total, verb, ",".join(dirs), bname, fout))
            try:
                handler(bname, dirs, fout)
            except IOError as ioe:
                sys.stderr.write("[!!] %s\n" % (str(ioe),))

    def merge_file(self, bname, dirs, fout):
        """Row-merge the file `bname` from every directory in `dirs` into `fout`.

        Reads self.fmt, which is set by default().
        """
        files = [os.path.join(d, bname) for d in dirs]
        # The with-block closes the output file even if reading a source or
        # writing a row raises.
        with open(fout, 'w') as out:
            self._copy_comments(files, out)
            for r in merged_data(files, self.options.keycol):
                print_row(r, out, self.fmt)

    def sub_column(self, bname, dirs, fout):
        """Write `bname` from dirs[0] to `fout` with columns substituted.

        The n-th column index in self.options.sub_col is paired with the
        copy of `bname` in dirs[n + 1]. Reads self.fmt, which is set by
        default().
        """
        files = [os.path.join(d, bname) for d in dirs]
        # The with-block closes the output file even if reading a source or
        # writing a row raises.
        with open(fout, 'w') as out:
            self._copy_comments(files, out)
            # Materialize the pairs: they are printed and then iterated.
            srcs = list(zip(self.options.sub_col, files[1:]))
            print("%s <- %s + %s" % (fout, files[0], srcs))
            for r in substituted_data(files[0], srcs,
                                      self.options.keycol):
                print_row(r, out, self.fmt)

    def target_name(self, fname, extra=None):
        """Return (output file name, base name, search directories) for `fname`.

        If `extra` is given it is inserted between the base name and its
        extension. The directory list starts with the directory of `fname`,
        followed by the -d/--dir directories.
        """
        path, bname = os.path.split(fname)
        if extra:
            bname, ext = os.path.splitext(bname)
            bname = "%s%s%s" % (bname, extra, ext)
        return (self.options.prefix + bname, bname, [path] + self.options.merge_dirs)

    def merge(self):
        """Row-merge every data file given on the command line."""
        self._process_targets("Merging", self.merge_file)

    def sub(self):
        """Column-substitute every data file given on the command line."""
        self._process_targets("Column-merging", self.sub_column)

    def default(self, _):
        """Build the number format, then run substitution or merge mode."""
        self.fmt = "%%%d.%df" % (self.options.colwidth, self.options.precision)
        print(self.options.sub_col)
        if self.options.sub_col:
            self.sub()
        else:
            self.merge()
| 147 | |||
if __name__ == '__main__':
    # Script entry point: build the application object and hand control to
    # its launch() method (inherited from defapp.App).
    app = DataFileMerger()
    app.launch()
| 150 | |||
