diff options
| author | Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu> | 2008-09-04 20:48:46 -0400 |
|---|---|---|
| committer | Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu> | 2008-09-04 20:48:46 -0400 |
| commit | 8a32b55ce1c25580da379555b2c4a5f149cfd43b (patch) | |
| tree | 82af205c47bbba9a6bd3e0bc1404101c09f1b8ce /csv_tool | |
| parent | 924e8a861c6fc0ce73efa6caf1525404fedb4494 (diff) | |
started work on a csv transformation tool
Diffstat (limited to 'csv_tool')
| -rwxr-xr-x | csv_tool | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/csv_tool b/csv_tool new file mode 100755 index 0000000..455037c --- /dev/null +++ b/csv_tool | |||
| @@ -0,0 +1,118 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
"""
Transform CSV files: group the rows by the value in a key column and
reduce each group to a single row (average, minimum, or maximum).
"""
| 6 | |||
| 7 | import optparse | ||
| 8 | import defapp | ||
| 9 | |||
| 10 | import csv | ||
| 11 | import operator | ||
| 12 | from collections import defaultdict as defdict | ||
| 13 | |||
# Short alias used to declare command-line options.
o = optparse.make_option

# Example declarations kept from the option template (not active):
#    o('-t', '--two', action='store', dest='double_val', nargs=2, type='int',
#      help='A two-parameter option.'),
#    o(None, '--true', action='store_true', dest='truth',
#      help='A boolean flag value.'),
#    o(None, '--degree', action='store', type='float', dest='thruthiness',
#      help='Not quite absolut truth.'),

# -c/--column: integer index of the column the tool operates on.
_column_option = o(
    '-c', '--column',
    action='store',
    type='int',
    dest='col',
    help='The column on which to operate.',
)

# Options handed to defapp.App by CsvApp.
opts = [_column_option]

# Default option values: operate on the first column unless told otherwise.
defaults = dict(col=0)
| 34 | |||
def make_vector_op(op):
    """Lift a binary operator to an element-wise operator on two rows.

    The returned function ``vector_op(a, b, defvalue=0)`` combines the
    sequences `a` and `b` position by position and returns a new list that
    is as long as the longer input.  Where one sequence has run out,
    `defvalue` stands in for its missing element.

    `op` is always called as ``op(<element of a>, <element of b>)``, so the
    result does not depend on which argument happens to be the longer one.
    This matters for non-commutative operators such as subtraction.

    `a` and `b` must support ``len()``, iteration, and slicing (lists and
    tuples do).  Neither input is modified.
    """
    def vector_op(a, b, defvalue=0):
        common = min(len(a), len(b))
        # Positions present in both rows; zip() stops at the shorter one.
        combined = [op(x, y) for x, y in zip(a, b)]
        # At most one of the two tails below is non-empty.
        combined.extend(op(x, defvalue) for x in a[common:])
        combined.extend(op(defvalue, y) for y in b[common:])
        return combined
    return vector_op
| 50 | |||
def make_scalar_op(op):
    """Lift a binary operator to one that applies a scalar to a whole row.

    The returned function ``scalar_op(scalar, a)`` builds a new list holding
    ``op(x, scalar)`` for every element ``x`` of `a`, in order.
    """
    def scalar_op(scalar, a):
        result = []
        for element in a:
            result.append(op(element, scalar))
        return result
    return scalar_op
| 55 | |||
# Element-wise row combiners: sum, minimum and maximum of two rows.
# Each one is make_vector_op applied to the matching binary operator.
row_add, row_min, row_max = map(make_vector_op, (operator.add, min, max))
| 59 | |||
def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """Build a generator function that folds each group of rows into one row.

    Args:
        row_op: binary function combining two rows into one; it is folded
            left-to-right over all rows that share a key.
        fixup: optional post-processing hook called as
            ``fixup(key, rows, res)`` with the group's key, its list of rows
            and the folded result; its return value is what gets yielded.
            The default returns `res` unchanged.

    Returns:
        A function ``_reduce(order, by_key)``.  `order` is an iterable of
        keys (repeats allowed) and `by_key` maps each key to a non-empty
        list of rows.  The generator yields one result per distinct key, in
        order of the key's first appearance in `order`.  Keys that are not
        in `by_key` are skipped.  `by_key` is not modified.
    """
    # Local import: reduce() is a builtin only on Python 2, while functools
    # provides it on Python 2.6+ and Python 3 alike.
    from functools import reduce

    def _reduce(order, by_key):
        # `order` may name a key once per input row; remember which keys
        # were already emitted instead of deleting from the caller's mapping.
        emitted = set()
        for key in order:
            if key in emitted or key not in by_key:
                continue
            emitted.add(key)
            rows = by_key[key]
            yield fixup(key, rows, reduce(row_op, rows))
    return _reduce
| 69 | |||
| 70 | |||
# Scalar row operators: multiply or divide every element of a row by one
# number.  Both are called as row_xxx(scalar, row).
row_mul = make_scalar_op(operator.mul)
# operator.truediv instead of operator.div: `div` no longer exists on
# Python 3 and floor-divides integers on Python 2, which would silently
# truncate averages of integer rows.
row_div = make_scalar_op(operator.truediv)
| 73 | |||
def select_by_key(rows, col, cast=None):
    """Group rows by the value found in column `col`.

    Args:
        rows: iterable of rows, each an indexable sequence of fields
            (for example the output of ``csv.reader``).
        col: index of the key column.
        cast: optional callable applied to every field of a row before the
            row is stored.  The key itself is always the raw, un-cast field.

    Returns:
        A tuple ``(order, by_key)``.  `order` lists the key of every
        non-empty row in input order, so a key appears once per row that
        carries it.  `by_key` is a defaultdict mapping each key to the list
        of its (possibly cast) rows, in input order.

    Empty rows are skipped: ``csv.reader`` produces them for blank lines and
    they have no key column to group on.

    Raises:
        IndexError: if a non-empty row has no column `col`.
    """
    by_key = defdict(list)
    order = []
    for r in rows:
        if not r:
            # Blank line in the input; nothing to key on.
            continue
        key = r[col]
        by_key[key].append([cast(x) for x in r] if cast else r)
        order.append(key)
    return (order, by_key)
| 85 | |||
class CsvApp(defapp.App):
    """Command-line tool that groups CSV rows by a key column and reduces
    every group to a single row.

    The ``do_avg``, ``do_max`` and ``do_min`` methods are the available
    transformations.
    NOTE(review): presumably defapp.App dispatches to them by sub-command
    name and provides ``self.args``, ``self.options`` and ``self.outfile()``
    -- confirm against defapp.
    """

    def __init__(self):
        defapp.App.__init__(self, opts, defaults)

    def ordered_transform(self, make_iterator):
        """Apply a per-key row transformation to every input file.

        Each file named in ``self.args`` (after the first entry) is read as
        CSV, every field is converted to float, and the rows are grouped by
        the column ``self.options.col``.  ``make_iterator(order, by_key)``
        must return an iterable of output rows, which are written as CSV to
        ``self.outfile()``.

        An IOError is reported on standard output and ends processing of
        the remaining files.
        """
        # The first argument is not a file name and is dropped.
        # NOTE(review): presumably it is the sub-command name -- confirm.
        # Slicing (unlike `del files[0]`) also tolerates an empty list.
        files = list(self.args)[1:]
        try:
            for fn in files:
                # Read in the content.  select_by_key consumes the reader
                # completely, so the file can be closed right afterwards.
                with open(fn, 'r') as infile:
                    (order, by_key) = select_by_key(csv.reader(infile),
                                                    self.options.col, float)
                # Write out the transformed rows.
                csv.writer(self.outfile()).writerows(
                    make_iterator(order, by_key))
        except IOError as ex:
            print("Error: %s" % ex)

    def do_avg(self, _):
        """Average all rows with the same key in the selected column."""
        def fixup_avg(key, rows, res):
            # `res` is the element-wise sum of the group; divide by the
            # group size, then restore the key, which was averaged too.
            res = row_div(len(rows), res)
            res[self.options.col] = key
            return res
        self.ordered_transform(row_reduce(row_add, fixup_avg))

    def do_max(self, _):
        """Take the element-wise maximum of all rows with the same key."""
        self.ordered_transform(row_reduce(row_max))

    def do_min(self, _):
        """Take the element-wise minimum of all rows with the same key."""
        self.ordered_transform(row_reduce(row_min))
| 116 | |||
if __name__ == '__main__':
    # Start the application only when run as a script, not when imported.
    app = CsvApp()
    app.launch()
