aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xcsv_tool90
1 files changed, 65 insertions, 25 deletions
diff --git a/csv_tool b/csv_tool
index e747462..96cd99e 100755
--- a/csv_tool
+++ b/csv_tool
@@ -11,34 +11,36 @@ import csv
11import operator 11import operator
12import os.path 12import os.path
13from collections import defaultdict as defdict 13from collections import defaultdict as defdict
14from itertools import izip
14 15
15o = optparse.make_option 16o = optparse.make_option
16 17
17opts = [ 18opts = [
18 19
19 o('-x', '--exchange', action='append', dest='col_xchg', 20 o('-c', '--column', action='append', dest='col', type='int',
20 nargs=2, type='int', 21 help='The column(s) on which to operate.'),
21 help='Columns that should be switched with reorder.'),
22
23 o('-c', '--column', action='store', dest='col', type='int',
24 help='The column on which to operate.'),
25 22
26 o(None, '--write-to-file', action='store_true', dest='write_to_file', 23 o(None, '--write-to-file', action='store_true', dest='write_to_file',
27 help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'), 24 help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),
28 25
29# o(None, '--true', action='store_true', dest='truth', 26 o('-u', '--upper-bound', action='store_true', dest='upper_bound',
30# help='A boolean flag value.'), 27 help="The linear regression is modified to represent an upper bound."),
31 28
32# o(None, '--degree', action='store', type='float', dest='thruthiness', 29# o(None, '--degree', action='store', type='float', dest='thruthiness',
33# help='Not quite absolut truth.'), 30# help='Not quite absolut truth.'),
34 ] 31 ]
35 32
36defaults = { 33defaults = {
37 'col' : 1, 34 'col' : [],
38 'col_xcgh' : [],
39 'write_to_file' : False, 35 'write_to_file' : False,
36 'upper_bound' : False,
40 } 37 }
41 38
def pair_iter(it):
    """Yield consecutive, non-overlapping pairs from an iterable.

    ``[a, b, c, d]`` produces ``(a, b)`` and then ``(c, d)``.  If the input
    holds an odd number of items, the trailing unpaired item is consumed
    and dropped.

    The builtin ``next()`` is used instead of the ``.next()`` method, and
    exhaustion is handled explicitly, so the generator terminates cleanly
    rather than relying on ``StopIteration`` escaping from its body.
    """
    it = iter(it)
    while True:
        try:
            first = next(it)
            second = next(it)
        except StopIteration:
            # Input exhausted (possibly mid-pair): end the generator.
            return
        yield (first, second)
43
42def make_vector_op(op): 44def make_vector_op(op):
43 def vector_op(a, b, defvalue=0): 45 def vector_op(a, b, defvalue=0):
44 if len(a) > len(b): 46 if len(a) > len(b):
@@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res):
74 yield fixup(key, rows, res) 76 yield fixup(key, rows, res)
75 return _reduce 77 return _reduce
76 78
77 79
78row_mul = make_scalar_op(operator.mul) 80row_mul = make_scalar_op(operator.mul)
79row_div = make_scalar_op(operator.div) 81row_div = make_scalar_op(operator.div)
80 82
@@ -82,7 +84,7 @@ def transpose(rows):
82 rows = list(rows) 84 rows = list(rows)
83 if rows: 85 if rows:
84 r = len(rows) 86 r = len(rows)
85 c = max([len(x) for x in rows]) 87 c = max([len(x) for x in rows])
86 def at(x, y): 88 def at(x, y):
87 try: 89 try:
88 return rows[x][y] 90 return rows[x][y]
@@ -91,14 +93,39 @@ def transpose(rows):
91 for i in xrange(c): 93 for i in xrange(c):
92 yield [at(j, i) for j in xrange(r) ] 94 yield [at(j, i) for j in xrange(r) ]
93 95
94
95def reorder_columns(rows, xchg_pairs): 96def reorder_columns(rows, xchg_pairs):
96 for r in rows: 97 for r in rows:
97 print type(r)
98 for (x,y) in xchg_pairs: 98 for (x,y) in xchg_pairs:
99 r[x], r[y] = r[y], r[x] 99 r[x], r[y] = r[y], r[x]
100 yield r 100 yield r
101 101
def select_columns(rows, cols):
    """Yield, for every row, a new list holding only the requested columns.

    rows -- iterable of indexable rows
    cols -- iterable of zero-based column indices; the output columns
            appear in this order, and an index may be repeated

    An IndexError propagates if a row is shorter than a requested index.
    """
    # Materialize once: cols is re-used for every row, so a one-shot
    # iterator would otherwise be exhausted after the first row.
    cols = list(cols)
    for r in rows:
        yield [r[x] for x in cols]
105
def numpy_lstsq(x, y):
    """Fit the line ``y = c1 * x + c0`` to the given points by least squares.

    x, y -- equally long sequences of numbers or numeric strings (cells
            read from a CSV file arrive as strings)

    Returns a two-element array ``[c1, c0]``: slope first, then intercept.
    """
    from numpy import ones, asarray
    from numpy.linalg import lstsq
    # Convert explicitly instead of relying on implicit string-to-float
    # casting, which lstsq does not perform on its right-hand side.
    xs = asarray(x, dtype=float)
    ys = asarray(y, dtype=float)
    # Design matrix: column 0 holds x, column 1 stays 1.0 for the intercept.
    A = ones((len(ys), 2), dtype=float)
    A[:, 0] = xs
    return lstsq(A, ys)[0]
113
def max_delta(c0, c1, x, y):
    """Return the largest absolute deviation of the points from a line.

    c0, c1 -- intercept and slope of the line ``c0 + c1 * x``
    x, y   -- sequences of point coordinates; each value is passed
              through float(), so numeric strings are accepted

    Returns 0.0 when there are no points.
    """
    # Distinct loop names: the parameters x and y are not shadowed.
    deltas = [abs(c0 + c1 * float(xval) - float(yval))
              for (xval, yval) in zip(x, y)]
    if not deltas:
        return 0.0
    return max(deltas)
116
def least_squares(rows, xy_pairs, upper_bound):
    """Fit one straight line per (x, y) column pair and yield its parameters.

    rows        -- iterable of rows
    xy_pairs    -- sequence of (x_column, y_column) zero-based index pairs;
                   it is traversed twice
    upper_bound -- if true, the intercept is raised by the largest absolute
                   deviation between the fitted line and the data

    Yields one list ``[x_column + 1, y_column + 1, c0, c1]`` per pair,
    i.e. one-based column numbers, then intercept, then slope.
    """
    # Flatten the pairs into the column order x0, y0, x1, y1, ...
    wanted = []
    for (xcol, ycol) in xy_pairs:
        wanted.append(xcol)
        wanted.append(ycol)
    # After transposing, each selected column is one sequence of values,
    # and consecutive sequences belong to the same (x, y) pair.
    columns = transpose(select_columns(rows, wanted))
    fits = izip(xy_pairs, pair_iter(columns))
    for ((xcol, ycol), (xvals, yvals)) in fits:
        slope, intercept = numpy_lstsq(xvals, yvals)
        if upper_bound:
            intercept = intercept + max_delta(intercept, slope, xvals, yvals)
        yield [xcol + 1, ycol + 1, intercept, slope]
128
102def select_by_key(rows, col, cast=None): 129def select_by_key(rows, col, cast=None):
103 by_key = defdict(list) 130 by_key = defdict(list)
104 order = [] 131 order = []
@@ -115,13 +142,17 @@ class CsvApp(defapp.App):
115 def __init__(self): 142 def __init__(self):
116 defapp.App.__init__(self, opts, defaults) 143 defapp.App.__init__(self, opts, defaults)
117 # fixup human-friendly offsets 144 # fixup human-friendly offsets
118 self.options.col -= 1 145 if not self.options.col:
119 if self.options.col_xchg: 146 self.options.col = [1]
120 self.options.col_xchg = [(x - 1, y - 1) for (x, y) in 147 self.options.col = [x - 1 for x in self.options.col]
121 self.options.col_xchg] 148 self.options.col_pairs = list(pair_iter(self.options.col))
122 149
123 def transform(self, make_iterator, ordered=True): 150 def transform(self, make_iterator, ordered=True):
124 """Average all rows with the same key in a given column.""" 151 """
152 Read a file, pass the rows in the file to an iterator factory, and
153 write out the output of the iterator. The iterator performs the
154 desired transformation.
155 """
125 files = list(self.args) 156 files = list(self.args)
126 del files[0] 157 del files[0]
127 for fn in files: 158 for fn in files:
@@ -130,7 +161,7 @@ class CsvApp(defapp.App):
130 rows = csv.reader(open(fn, 'r')) 161 rows = csv.reader(open(fn, 'r'))
131 # set up transformation 162 # set up transformation
132 if ordered: 163 if ordered:
133 (order, by_key) = select_by_key(rows, self.options.col, 164 (order, by_key) = select_by_key(rows, self.options.col[0],
134 float) 165 float)
135 rows = make_iterator(order, by_key) 166 rows = make_iterator(order, by_key)
136 else: 167 else:
@@ -152,10 +183,10 @@ class CsvApp(defapp.App):
152 def do_avg(self, _): 183 def do_avg(self, _):
153 def fixup_avg(key, rows, res): 184 def fixup_avg(key, rows, res):
154 res = row_div(len(rows), res) 185 res = row_div(len(rows), res)
155 res[self.options.col] = key 186 res[self.options.col[0]] = key
156 return res 187 return res
157 self.transform(row_reduce(row_add, fixup_avg)) 188 self.transform(row_reduce(row_add, fixup_avg))
158 189
159 def do_max(self, _): 190 def do_max(self, _):
160 self.transform(row_reduce(row_max)) 191 self.transform(row_reduce(row_max))
161 192
@@ -166,9 +197,18 @@ class CsvApp(defapp.App):
166 self.transform(transpose, ordered=False) 197 self.transform(transpose, ordered=False)
167 198
168 def do_reorder(self, _): 199 def do_reorder(self, _):
169 self.transform(lambda rows: reorder_columns(rows, 200 self.transform(lambda rows: reorder_columns(
170 self.options.col_xchg), 201 rows, self.options.col_pairs), ordered=False)
171 ordered=False) 202
def do_select(self, _):
    """Output only the columns chosen with -c/--column, in the given order.

    The second argument is unused.
    """
    def pick(rows):
        # Options are read when the transformation runs, not when set up.
        return select_columns(rows, self.options.col)
    self.transform(pick, ordered=False)
206
def do_lstsqrs(self, _):
    """Run a least-squares line fit for each pair of selected columns.

    Column pairs come from self.options.col_pairs; the upper-bound flag
    (-u/--upper-bound) is passed through to least_squares.  The second
    argument is unused.
    """
    def fit(rows):
        # Options are read when the transformation runs, not when set up.
        return least_squares(rows, self.options.col_pairs,
                             self.options.upper_bound)
    self.transform(fit, ordered=False)
172 212
173if __name__ == '__main__': 213if __name__ == '__main__':
174 CsvApp().launch() 214 CsvApp().launch()