#!/usr/bin/env python

# Provenance: reconstructed from commit
# 8a32b55ce1c25580da379555b2c4a5f149cfd43b by Bjoern Brandenburg
# (Thu, 4 Sep 2008 20:48:46 -0400), "started work on a csv transformation
# tool", which created this file as 'csv_tool'.

"""
Do stuff with csv files.
"""

import optparse
import defapp

import csv
import operator
from collections import defaultdict as defdict
# reduce() is a builtin on Python 2 but lives only in functools on Python 3;
# importing it from functools works on both.
from functools import reduce

o = optparse.make_option

# Command-line options handed to defapp.App by CsvApp.
opts = [
    o('-c', '--column', action='store', dest='col', type='int',
      help='The column on which to operate.'),
    ]

# Default values for the options above.
defaults = {
    'col' : 0,
    }

def make_vector_op(op):
    """Lift the binary function *op* to an element-wise operation on rows.

    The returned ``vector_op(a, b, defvalue=0)`` builds a new list that is as
    long as the longer of *a* and *b*.  Element ``i`` is ``op(a[i], b[i])``;
    where one of the rows has run out, *defvalue* stands in for the missing
    element.  Neither input is modified.
    """
    def vector_op(a, b, defvalue=0):
        shared = min(len(a), len(b))
        result = [op(x, y) for x, y in zip(a, b)]
        # At most one of the two tails is non-empty.  Elements of 'a' always
        # go on the left and elements of 'b' on the right, so that
        # non-commutative operations see their operands in the right order.
        result.extend(op(x, defvalue) for x in a[shared:])
        result.extend(op(defvalue, y) for y in b[shared:])
        return result
    return vector_op

def make_scalar_op(op):
    """Lift the binary function *op* to a row-with-scalar operation.

    The returned ``scalar_op(scalar, a)`` yields a new list holding
    ``op(x, scalar)`` for every element ``x`` of *a*.  Note the argument
    order: the scalar is passed first but ends up as the right-hand operand.
    """
    def scalar_op(scalar, a):
        return [op(x, scalar) for x in a]
    return scalar_op

# Element-wise combinations of two rows.
row_add = make_vector_op(operator.add)
row_min = make_vector_op(min)
row_max = make_vector_op(max)

def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """Build a generator function that folds each group of rows into one row.

    The returned ``_reduce(order, by_key)`` walks the keys in *order* and, for
    each key still present in *by_key*, reduces that key's rows with *row_op*
    and yields ``fixup(key, rows, reduced_row)``.

    Each key is removed from *by_key* once it has been processed, so a key
    that occurs several times in *order* produces only one output row, at the
    position of its first occurrence.  The caller's mapping is therefore
    emptied as a side effect.
    """
    def _reduce(order, by_key):
        for key in order:
            if key not in by_key:
                # Already emitted for an earlier occurrence of this key.
                continue
            rows = by_key.pop(key)
            yield fixup(key, rows, reduce(row_op, rows))
    return _reduce


# Multiply every element of a row by a scalar.
row_mul = make_scalar_op(operator.mul)
# Divide every element of a row by a scalar.  operator.div only exists on
# Python 2; fall back to true division where it is missing.
row_div = make_scalar_op(getattr(operator, 'div', operator.truediv))

def select_by_key(rows, col, cast=None):
    """Group *rows* by the value found in column *col*.

    rows -- iterable of sequences (e.g. a csv.reader)
    col  -- index of the key column
    cast -- optional callable applied to every field of a row before the row
            is stored; the key itself is always the original, uncast value

    Returns a tuple ``(order, by_key)``.  ``order`` lists the key of every
    row in input order (keys repeat if several rows share them) and
    ``by_key`` maps each key to the list of its rows.

    Empty rows, which csv.reader produces for blank lines, are skipped.
    Any exception raised by *cast* propagates to the caller.
    """
    by_key = defdict(list)
    order = []
    for r in rows:
        if not r:
            # Blank line in the input: there is no key column to look at.
            continue
        key = r[col]
        by_key[key].append([cast(x) for x in r] if cast else r)
        order.append(key)
    return (order, by_key)

class CsvApp(defapp.App):
    """Command-line tool that collapses csv rows sharing a key column.

    NOTE(review): command dispatch to the do_NAME methods, as well as
    self.args, self.options and self.outfile(), come from defapp.App, which
    is not visible here -- confirm their exact semantics there.
    """

    def __init__(self):
        defapp.App.__init__(self, opts, defaults)

    def ordered_transform(self, make_iterator):
        """Apply a grouping transformation to every input file.

        make_iterator -- callable taking ``(order, by_key)`` as returned by
                         select_by_key() and producing the output rows, e.g.
                         the result of row_reduce()

        Every field of every row is converted to float, so a non-numeric
        field raises ValueError.  An IOError is reported on standard output
        and ends processing of the remaining files.
        """
        # The first argument is skipped; presumably it is the command name
        # (verify against defapp).  Slicing also copes with an empty list.
        files = list(self.args)[1:]
        try:
            for fn in files:
                # read in content; select_by_key() consumes the reader
                # completely, so the file can be closed right afterwards
                with open(fn, 'r') as infile:
                    (order, by_key) = select_by_key(csv.reader(infile),
                                                    self.options.col, float)
                # write out
                csv.writer(self.outfile()).writerows(
                    make_iterator(order, by_key))
        except IOError as ex:
            print("Error: %s" % (ex,))

    def do_avg(self, _):
        """Emit, per key, the column-wise average of all rows with that key."""
        def fixup_avg(key, rows, res):
            res = row_div(len(rows), res)
            # The key column got summed and divided like any other column;
            # put the original key text back.
            res[self.options.col] = key
            return res
        self.ordered_transform(row_reduce(row_add, fixup_avg))

    def do_max(self, _):
        """Emit, per key, the column-wise maximum of all rows with that key.

        No fixup is applied, so the key column holds the float-converted key.
        """
        self.ordered_transform(row_reduce(row_max))

    def do_min(self, _):
        """Emit, per key, the column-wise minimum of all rows with that key.

        No fixup is applied, so the key column holds the float-converted key.
        """
        self.ordered_transform(row_reduce(row_min))

if __name__ == '__main__':
    CsvApp().launch()