From 3da42149996ac964b2f6fd6a6673e1b59522a5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bjo=CC=88rn=20B=2E=20Brandenburg?= Date: Tue, 12 Oct 2010 17:52:30 -0400 Subject: add tool for merging multiple data files into one --- merge.py | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100755 merge.py diff --git a/merge.py b/merge.py new file mode 100755 index 0000000..153c0c2 --- /dev/null +++ b/merge.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python + +import defapp +from optparse import make_option as o +from os.path import splitext, basename, split + +import os + +import sys + +from util import load_csv_file + +from wsched import split_rows, group, filter_comments, print_row + + +def data_from(fname, keycol): + data = load_csv_file(fname) + return group(data, keycol) + +def merged_data(fnames, keycol, replace=True): + all_data = {} + for fname in fnames: + try: + for (key, rows) in data_from(fname, keycol): + if key in all_data and not replace: + all_data[key].extend(rows) + else: + all_data[key] = rows + except IOError as ioe: + print >>sys.stderr, "[!!] Skipping %s; %s." % (fname, str(ioe)) + for key in sorted(all_data.keys()): + for row in all_data[key]: + yield row + +def substituted_data(origname, files, keycol): + data = load_csv_file(origname) + (by_key, order) = split_rows(data, keycol) + for (idx, fname) in files: + try: + for (key, rows) in data_from(fname, keycol): + if key in by_key: + for orow, nrow in zip(by_key[key], rows): + # substitute + orow[idx] = nrow[idx] + except IOError as ioe: + print >>sys.stderr, "[!!] Skipping %s; %s." % (fname, str(ioe)) + for key in order: + for row in by_key[key]: + yield row + + +options = [ + # output options + o('-o', '--output-prefix', action='store', dest='prefix'), + o('-k', '--key-column', action='store', type='int', dest='keycol'), + o(None, '--no-comments', action='store_true', dest='no_comments'), + o('-w', '--col-width', action='store', type='int', dest='colwidth'), + o('-d', '--dir', action='append', dest='merge_dirs'), + o('-s', '--sub', action='append', type='int', dest='sub_col'), + ] + +defaults = { + 'prefix' : None, + 'keycol' : 0, + 'no_comments' : False, + 'colwidth' : 10, + 'precision' : 3, + 'indent' : 2, + 'merge_dirs' : [], + 'sub_col' : [], + } + + +class DataFileMerger(defapp.App): + def __init__(self): + defapp.App.__init__(self, options, defaults, no_std_opts=True) + self.out = sys.stdout + + def merge_file(self, bname, dirs, fout): + out = open(fout, 'w') + files = [os.path.join(d, bname) for d in dirs] + if not self.options.no_comments: + for src in files: + filter_comments(src, out=out) + for r in merged_data(files, self.options.keycol): + print_row(r, out, self.fmt) + out.close() + + + def sub_column(self, bname, dirs, fout): + out = open(fout, 'w') + files = [os.path.join(d, bname) for d in dirs] + if not self.options.no_comments: + for src in files: + filter_comments(src, out=out) + srcs = zip(self.options.sub_col, files[1:]) + print fout, '<-', files[0], '+', srcs + for r in substituted_data(files[0], srcs, + self.options.keycol): + print_row(r, out, self.fmt) + out.close() + + def target_name(self, fname, extra=None): + path, bname = os.path.split(fname) + if extra: + bname, ext = os.path.splitext(bname) + bname = "%s%s%s" % (bname, extra, ext) + return (self.options.prefix + bname, bname, [path] + self.options.merge_dirs) + + def merge(self): + if self.options.prefix: + for i, datafile in enumerate(self.args): + (fout, bname, dirs) = self.target_name(datafile) + print "[%d/%d] Merging {%s}/%s -> %s" % \ + (i + 1, len(self.args), ",".join(dirs), bname, fout) + try: + self.merge_file(bname, dirs, fout) + except IOError as ioe: + sys.stderr.write("[!!] ") + sys.stderr.write(str(ioe)) + sys.stderr.write("\n") + else: + print "Requires output prefix (-o)." + + def sub(self): + if self.options.prefix: + for i, datafile in enumerate(self.args): + (fout, bname, dirs) = self.target_name(datafile) + print "[%d/%d] Column-merging {%s}/%s -> %s" % \ + (i + 1, len(self.args), ",".join(dirs), bname, fout) + try: + self.sub_column(bname, dirs, fout) + except IOError as ioe: + sys.stderr.write("[!!] ") + sys.stderr.write(str(ioe)) + sys.stderr.write("\n") + else: + print "Requires output prefix (-o)." + + def default(self, _): + self.fmt = "%%%d.%df" % (self.options.colwidth, self.options.precision) + print self.options.sub_col + if self.options.sub_col: + self.sub() + else: + self.merge() + +if __name__ == '__main__': + DataFileMerger().launch() + -- cgit v1.2.2