diff options
author | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2008-09-25 17:06:19 -0400 |
---|---|---|
committer | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2008-09-25 17:06:19 -0400 |
commit | afbe6a6bf1e7248aa0bdaf506413d604c974002c (patch) | |
tree | c86228c95ea915f534dc881627e6aa297ab55cce | |
parent | 5e3ea2b647cab744481e1dfa19fa1683bcdbac4b (diff) |
add support for linear regressions
-rwxr-xr-x | csv_tool | 90 |
1 files changed, 65 insertions, 25 deletions
@@ -11,34 +11,36 @@ import csv | |||
11 | import operator | 11 | import operator |
12 | import os.path | 12 | import os.path |
13 | from collections import defaultdict as defdict | 13 | from collections import defaultdict as defdict |
14 | from itertools import izip | ||
14 | 15 | ||
15 | o = optparse.make_option | 16 | o = optparse.make_option |
16 | 17 | ||
17 | opts = [ | 18 | opts = [ |
18 | 19 | ||
19 | o('-x', '--exchange', action='append', dest='col_xchg', | 20 | o('-c', '--column', action='append', dest='col', type='int', |
20 | nargs=2, type='int', | 21 | help='The column(s) on which to operate.'), |
21 | help='Columns that should be switched with reorder.'), | ||
22 | |||
23 | o('-c', '--column', action='store', dest='col', type='int', | ||
24 | help='The column on which to operate.'), | ||
25 | 22 | ||
26 | o(None, '--write-to-file', action='store_true', dest='write_to_file', | 23 | o(None, '--write-to-file', action='store_true', dest='write_to_file', |
27 | help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'), | 24 | help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'), |
28 | 25 | ||
29 | # o(None, '--true', action='store_true', dest='truth', | 26 | o('-u', '--upper-bound', action='store_true', dest='upper_bound', |
30 | # help='A boolean flag value.'), | 27 | help="The linear regression is modified to represent an upper bound."), |
31 | 28 | ||
32 | # o(None, '--degree', action='store', type='float', dest='thruthiness', | 29 | # o(None, '--degree', action='store', type='float', dest='thruthiness', |
33 | # help='Not quite absolut truth.'), | 30 | # help='Not quite absolut truth.'), |
34 | ] | 31 | ] |
35 | 32 | ||
36 | defaults = { | 33 | defaults = { |
37 | 'col' : 1, | 34 | 'col' : [], |
38 | 'col_xcgh' : [], | ||
39 | 'write_to_file' : False, | 35 | 'write_to_file' : False, |
36 | 'upper_bound' : False, | ||
40 | } | 37 | } |
41 | 38 | ||
39 | def pair_iter(it): | ||
40 | it = iter(it) | ||
41 | while True: | ||
42 | yield (it.next(), it.next()) | ||
43 | |||
42 | def make_vector_op(op): | 44 | def make_vector_op(op): |
43 | def vector_op(a, b, defvalue=0): | 45 | def vector_op(a, b, defvalue=0): |
44 | if len(a) > len(b): | 46 | if len(a) > len(b): |
@@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res): | |||
74 | yield fixup(key, rows, res) | 76 | yield fixup(key, rows, res) |
75 | return _reduce | 77 | return _reduce |
76 | 78 | ||
77 | 79 | ||
78 | row_mul = make_scalar_op(operator.mul) | 80 | row_mul = make_scalar_op(operator.mul) |
79 | row_div = make_scalar_op(operator.div) | 81 | row_div = make_scalar_op(operator.div) |
80 | 82 | ||
@@ -82,7 +84,7 @@ def transpose(rows): | |||
82 | rows = list(rows) | 84 | rows = list(rows) |
83 | if rows: | 85 | if rows: |
84 | r = len(rows) | 86 | r = len(rows) |
85 | c = max([len(x) for x in rows]) | 87 | c = max([len(x) for x in rows]) |
86 | def at(x, y): | 88 | def at(x, y): |
87 | try: | 89 | try: |
88 | return rows[x][y] | 90 | return rows[x][y] |
@@ -91,14 +93,39 @@ def transpose(rows): | |||
91 | for i in xrange(c): | 93 | for i in xrange(c): |
92 | yield [at(j, i) for j in xrange(r) ] | 94 | yield [at(j, i) for j in xrange(r) ] |
93 | 95 | ||
94 | |||
95 | def reorder_columns(rows, xchg_pairs): | 96 | def reorder_columns(rows, xchg_pairs): |
96 | for r in rows: | 97 | for r in rows: |
97 | print type(r) | ||
98 | for (x,y) in xchg_pairs: | 98 | for (x,y) in xchg_pairs: |
99 | r[x], r[y] = r[y], r[x] | 99 | r[x], r[y] = r[y], r[x] |
100 | yield r | 100 | yield r |
101 | 101 | ||
102 | def select_columns(rows, cols): | ||
103 | for r in rows: | ||
104 | yield [r[x] for x in cols] | ||
105 | |||
106 | def numpy_lstsq(x, y): | ||
107 | from numpy import ones, array | ||
108 | from numpy.linalg import lstsq | ||
109 | A = ones((len(y), 2), dtype=float) | ||
110 | A[:,0] = array(x) | ||
111 | b = array(y) | ||
112 | return lstsq(A, b)[0] | ||
113 | |||
114 | def max_delta(c0, c1, x, y): | ||
115 | return max([abs(c0 + c1 * float(x) - float(y)) for (x,y) in izip(x, y)]) | ||
116 | |||
117 | def least_squares(rows, xy_pairs, upper_bound): | ||
118 | cols = [] | ||
119 | for (x, y) in xy_pairs: | ||
120 | cols += [x, y] | ||
121 | rows = select_columns(rows, cols) | ||
122 | cols = transpose(rows) | ||
123 | for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(cols)): | ||
124 | c1, c0 = numpy_lstsq(xval, yval) | ||
125 | if upper_bound: | ||
126 | c0 += max_delta(c0, c1, xval, yval) | ||
127 | yield [x + 1, y + 1, c0, c1] | ||
128 | |||
102 | def select_by_key(rows, col, cast=None): | 129 | def select_by_key(rows, col, cast=None): |
103 | by_key = defdict(list) | 130 | by_key = defdict(list) |
104 | order = [] | 131 | order = [] |
@@ -115,13 +142,17 @@ class CsvApp(defapp.App): | |||
115 | def __init__(self): | 142 | def __init__(self): |
116 | defapp.App.__init__(self, opts, defaults) | 143 | defapp.App.__init__(self, opts, defaults) |
117 | # fixup human-friendly offsets | 144 | # fixup human-friendly offsets |
118 | self.options.col -= 1 | 145 | if not self.options.col: |
119 | if self.options.col_xchg: | 146 | self.options.col = [1] |
120 | self.options.col_xchg = [(x - 1, y - 1) for (x, y) in | 147 | self.options.col = [x - 1 for x in self.options.col] |
121 | self.options.col_xchg] | 148 | self.options.col_pairs = list(pair_iter(self.options.col)) |
122 | 149 | ||
123 | def transform(self, make_iterator, ordered=True): | 150 | def transform(self, make_iterator, ordered=True): |
124 | """Average all rows with the same key in a given column.""" | 151 | """ |
152 | Read a file, pass the rows in the file to an iterator factory, and | ||
153 | write out the output of the iterator. The iterator performs the | ||
154 | desired transformation. | ||
155 | """ | ||
125 | files = list(self.args) | 156 | files = list(self.args) |
126 | del files[0] | 157 | del files[0] |
127 | for fn in files: | 158 | for fn in files: |
@@ -130,7 +161,7 @@ class CsvApp(defapp.App): | |||
130 | rows = csv.reader(open(fn, 'r')) | 161 | rows = csv.reader(open(fn, 'r')) |
131 | # set up transformation | 162 | # set up transformation |
132 | if ordered: | 163 | if ordered: |
133 | (order, by_key) = select_by_key(rows, self.options.col, | 164 | (order, by_key) = select_by_key(rows, self.options.col[0], |
134 | float) | 165 | float) |
135 | rows = make_iterator(order, by_key) | 166 | rows = make_iterator(order, by_key) |
136 | else: | 167 | else: |
@@ -152,10 +183,10 @@ class CsvApp(defapp.App): | |||
152 | def do_avg(self, _): | 183 | def do_avg(self, _): |
153 | def fixup_avg(key, rows, res): | 184 | def fixup_avg(key, rows, res): |
154 | res = row_div(len(rows), res) | 185 | res = row_div(len(rows), res) |
155 | res[self.options.col] = key | 186 | res[self.options.col[0]] = key |
156 | return res | 187 | return res |
157 | self.transform(row_reduce(row_add, fixup_avg)) | 188 | self.transform(row_reduce(row_add, fixup_avg)) |
158 | 189 | ||
159 | def do_max(self, _): | 190 | def do_max(self, _): |
160 | self.transform(row_reduce(row_max)) | 191 | self.transform(row_reduce(row_max)) |
161 | 192 | ||
@@ -166,9 +197,18 @@ class CsvApp(defapp.App): | |||
166 | self.transform(transpose, ordered=False) | 197 | self.transform(transpose, ordered=False) |
167 | 198 | ||
168 | def do_reorder(self, _): | 199 | def do_reorder(self, _): |
169 | self.transform(lambda rows: reorder_columns(rows, | 200 | self.transform(lambda rows: reorder_columns( |
170 | self.options.col_xchg), | 201 | rows, self.options.col_pairs), ordered=False) |
171 | ordered=False) | 202 | |
203 | def do_select(self, _): | ||
204 | self.transform(lambda rows: select_columns( | ||
205 | rows, self.options.col), ordered=False) | ||
206 | |||
207 | def do_lstsqrs(self, _): | ||
208 | self.transform( | ||
209 | lambda rows: least_squares(rows, self.options.col_pairs, | ||
210 | self.options.upper_bound), | ||
211 | ordered=False) | ||
172 | 212 | ||
173 | if __name__ == '__main__': | 213 | if __name__ == '__main__': |
174 | CsvApp().launch() | 214 | CsvApp().launch() |