diff options
author | Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu> | 2008-09-04 20:48:46 -0400 |
---|---|---|
committer | Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu> | 2008-09-04 20:48:46 -0400 |
commit | 8a32b55ce1c25580da379555b2c4a5f149cfd43b (patch) | |
tree | 82af205c47bbba9a6bd3e0bc1404101c09f1b8ce /csv_tool | |
parent | 924e8a861c6fc0ce73efa6caf1525404fedb4494 (diff) |
started work on a csv transformation tool
Diffstat (limited to 'csv_tool')
-rwxr-xr-x | csv_tool | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/csv_tool b/csv_tool new file mode 100755 index 0000000..455037c --- /dev/null +++ b/csv_tool | |||
@@ -0,0 +1,118 @@ | |||
1 | #!/usr/bin/env python | ||
2 | |||
3 | """ | ||
4 | Do stuff with csv files. | ||
5 | """ | ||
6 | |||
7 | import optparse | ||
8 | import defapp | ||
9 | |||
10 | import csv | ||
11 | import operator | ||
12 | from collections import defaultdict as defdict | ||
13 | |||
# Short alias so the option table below stays compact.
o = optparse.make_option

# Command-line options.  The commented-out entries are templates showing how
# other kinds of options (two-value, boolean, float) would be declared.
opts = [
    # o('-t', '--two', action='store', dest='double_val', nargs=2, type='int',
    #   help='A two-parameter option.'),

    o('-c', '--column',
      dest='col',
      type='int',
      action='store',
      help='The column on which to operate.'),

    # o(None, '--true', action='store_true', dest='truth',
    #   help='A boolean flag value.'),

    # o(None, '--degree', action='store', type='float', dest='truthiness',
    #   help='Not quite absolute truth.'),
]

# Initial option values; CsvApp hands both tables to defapp.App.__init__.
defaults = dict(col=0)
34 | |||
def make_vector_op(op):
    """Lift the binary function `op` to work element-wise on two rows.

    The returned function has the signature vector_op(a, b, defvalue=0) and
    returns a new list whose i-th element is op(a[i], b[i]).  When the rows
    differ in length, the missing elements of the shorter row are replaced by
    `defvalue`, so the result is always as long as the longer row.

    Operand order is preserved: elements of `a` are always passed as the
    first argument of `op` and elements of `b` as the second, which matters
    for non-commutative operations such as subtraction.
    """
    def vector_op(a, b, defvalue=0):
        len_a = len(a)
        len_b = len(b)
        result = []
        for i in range(max(len_a, len_b)):
            # Pad whichever side has run out of elements.
            left = a[i] if i < len_a else defvalue
            right = b[i] if i < len_b else defvalue
            result.append(op(left, right))
        return result
    return vector_op
50 | |||
def make_scalar_op(op):
    """Lift the binary function `op` to apply one scalar across a whole row.

    The returned function takes the scalar first and the row second, and
    returns a new list holding op(element, scalar) for every element of the
    row, in order.
    """
    def scalar_op(scalar, a):
        result = []
        for element in a:
            result.append(op(element, scalar))
        return result
    return scalar_op
55 | |||
# Element-wise row combinators (sum, minimum, maximum of two rows).
row_add, row_min, row_max = (
    make_vector_op(operator.add),
    make_vector_op(min),
    make_vector_op(max),
)
59 | |||
def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """Build a generator function that folds all rows of each key into one.

    row_op -- binary function combining two rows into one (e.g. row_add).
    fixup  -- called as fixup(key, rows, res) with the key, the list of rows
              that were combined and the folded result; its return value is
              what gets yielded.  The default passes `res` through unchanged.

    The returned function has the signature _reduce(order, by_key), where
    `order` is an iterable of keys (duplicates allowed) and `by_key` maps each
    key to a non-empty list of rows.  It yields one result per distinct key,
    in order of first appearance.  Keys are removed from `by_key` as they are
    consumed, which is how repeated keys in `order` are skipped.
    """
    # reduce() is a builtin only on Python 2; functools provides it on
    # Python 2.6+ and Python 3 alike.
    from functools import reduce

    def _reduce(order, by_key):
        for key in order:
            if key not in by_key:
                # Already emitted (or never present): nothing to do.
                continue
            rows = by_key.pop(key)
            yield fixup(key, rows, reduce(row_op, rows))
    return _reduce
69 | |||
70 | |||
# Scalar row operations, called as row_mul(scalar, row) / row_div(scalar, row).
row_mul = make_scalar_op(operator.mul)
# operator.div exists only on Python 2 and truncates for integer operands;
# operator.truediv is available on both Python 2 and 3 and always performs
# real division, which is what averaging needs.
row_div = make_scalar_op(operator.truediv)
73 | |||
def select_by_key(rows, col, cast=None):
    """Group rows by the value found in column `col`.

    rows -- iterable of sequences, e.g. a csv.reader.
    col  -- index of the column whose value is used as the grouping key.
    cast -- optional callable applied to every cell of a row before the row
            is stored.  The key itself is always the raw, uncast cell.

    Returns a tuple (order, by_key).  `order` lists the key of every row in
    input order, so a key shows up once per row that carries it.  `by_key`
    is a defaultdict(list) mapping each key to its rows in input order.

    Completely empty rows (what csv.reader produces for blank lines) are
    skipped.  A non-empty row that is too short to have column `col` still
    raises IndexError.
    """
    by_key = defdict(list)
    order = []
    for r in rows:
        if not r:
            # Blank line in the input: there is no key column to read.
            continue
        key = r[col]
        by_key[key].append([cast(x) for x in r] if cast else r)
        order.append(key)
    return (order, by_key)
85 | |||
class CsvApp(defapp.App):
    """Command-line application that collapses CSV rows sharing a key.

    Rows are grouped by the value in the column selected with -c/--column
    and each group is reduced to a single output row (average, maximum or
    minimum, depending on the do_* method that is invoked).
    """

    def __init__(self):
        defapp.App.__init__(self, opts, defaults)

    def ordered_transform(self, make_iterator):
        """Group each input file by the key column and write the result.

        make_iterator -- called as make_iterator(order, by_key) with the
                         values returned by select_by_key(); must return an
                         iterable of output rows (e.g. the function built by
                         row_reduce()).

        Every cell of the input is converted to float.  An IOError is
        reported on standard output and stops processing of the remaining
        files.
        """
        # The first argument is not a file name and is skipped; presumably it
        # is the sub-command that selected the do_* method -- confirm against
        # defapp.
        files = list(self.args)[1:]
        try:
            for fn in files:
                # Read in the content.  select_by_key() consumes the reader
                # completely, so the file can be closed right afterwards.
                with open(fn, 'r') as infile:
                    (order, by_key) = select_by_key(csv.reader(infile),
                                                    self.options.col, float)
                # Write out the transformed rows.
                csv.writer(self.outfile()).writerows(make_iterator(order, by_key))
        except IOError as ex:
            print("Error: %s" % (ex,))

    def do_avg(self, _):
        """Write, for each key, the column-wise average of its rows."""
        def fixup_avg(key, rows, res):
            # `res` holds the column-wise sums; divide by the group size.
            res = row_div(len(rows), res)
            # The key column was summed and divided too; restore the key.
            res[self.options.col] = key
            return res
        self.ordered_transform(row_reduce(row_add, fixup_avg))

    def do_max(self, _):
        """Write, for each key, the column-wise maximum of its rows."""
        self.ordered_transform(row_reduce(row_max))

    def do_min(self, _):
        """Write, for each key, the column-wise minimum of its rows."""
        self.ordered_transform(row_reduce(row_min))
116 | |||
if __name__ == '__main__':
    # Script entry point: build the application and hand control to it.
    app = CsvApp()
    app.launch()