diff options
| author | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2008-09-25 17:06:19 -0400 |
|---|---|---|
| committer | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2008-09-25 17:06:19 -0400 |
| commit | afbe6a6bf1e7248aa0bdaf506413d604c974002c (patch) | |
| tree | c86228c95ea915f534dc881627e6aa297ab55cce /csv_tool | |
| parent | 5e3ea2b647cab744481e1dfa19fa1683bcdbac4b (diff) | |
add support for linear regressions
Diffstat (limited to 'csv_tool')
| -rwxr-xr-x | csv_tool | 90 |
1 files changed, 65 insertions, 25 deletions
| @@ -11,34 +11,36 @@ import csv | |||
| 11 | import operator | 11 | import operator |
| 12 | import os.path | 12 | import os.path |
| 13 | from collections import defaultdict as defdict | 13 | from collections import defaultdict as defdict |
| 14 | from itertools import izip | ||
| 14 | 15 | ||
| 15 | o = optparse.make_option | 16 | o = optparse.make_option |
| 16 | 17 | ||
| 17 | opts = [ | 18 | opts = [ |
| 18 | 19 | ||
| 19 | o('-x', '--exchange', action='append', dest='col_xchg', | 20 | o('-c', '--column', action='append', dest='col', type='int', |
| 20 | nargs=2, type='int', | 21 | help='The column(s) on which to operate.'), |
| 21 | help='Columns that should be switched with reorder.'), | ||
| 22 | |||
| 23 | o('-c', '--column', action='store', dest='col', type='int', | ||
| 24 | help='The column on which to operate.'), | ||
| 25 | 22 | ||
| 26 | o(None, '--write-to-file', action='store_true', dest='write_to_file', | 23 | o(None, '--write-to-file', action='store_true', dest='write_to_file', |
| 27 | help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'), | 24 | help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'), |
| 28 | 25 | ||
| 29 | # o(None, '--true', action='store_true', dest='truth', | 26 | o('-u', '--upper-bound', action='store_true', dest='upper_bound', |
| 30 | # help='A boolean flag value.'), | 27 | help="The linear regression is modified to represent an upper bound."), |
| 31 | 28 | ||
| 32 | # o(None, '--degree', action='store', type='float', dest='thruthiness', | 29 | # o(None, '--degree', action='store', type='float', dest='thruthiness', |
| 33 | # help='Not quite absolut truth.'), | 30 | # help='Not quite absolut truth.'), |
| 34 | ] | 31 | ] |
| 35 | 32 | ||
| 36 | defaults = { | 33 | defaults = { |
| 37 | 'col' : 1, | 34 | 'col' : [], |
| 38 | 'col_xcgh' : [], | ||
| 39 | 'write_to_file' : False, | 35 | 'write_to_file' : False, |
| 36 | 'upper_bound' : False, | ||
| 40 | } | 37 | } |
| 41 | 38 | ||
| 39 | def pair_iter(it): | ||
| 40 | it = iter(it) | ||
| 41 | while True: | ||
| 42 | yield (it.next(), it.next()) | ||
| 43 | |||
| 42 | def make_vector_op(op): | 44 | def make_vector_op(op): |
| 43 | def vector_op(a, b, defvalue=0): | 45 | def vector_op(a, b, defvalue=0): |
| 44 | if len(a) > len(b): | 46 | if len(a) > len(b): |
| @@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res): | |||
| 74 | yield fixup(key, rows, res) | 76 | yield fixup(key, rows, res) |
| 75 | return _reduce | 77 | return _reduce |
| 76 | 78 | ||
| 77 | 79 | ||
| 78 | row_mul = make_scalar_op(operator.mul) | 80 | row_mul = make_scalar_op(operator.mul) |
| 79 | row_div = make_scalar_op(operator.div) | 81 | row_div = make_scalar_op(operator.div) |
| 80 | 82 | ||
| @@ -82,7 +84,7 @@ def transpose(rows): | |||
| 82 | rows = list(rows) | 84 | rows = list(rows) |
| 83 | if rows: | 85 | if rows: |
| 84 | r = len(rows) | 86 | r = len(rows) |
| 85 | c = max([len(x) for x in rows]) | 87 | c = max([len(x) for x in rows]) |
| 86 | def at(x, y): | 88 | def at(x, y): |
| 87 | try: | 89 | try: |
| 88 | return rows[x][y] | 90 | return rows[x][y] |
| @@ -91,14 +93,39 @@ def transpose(rows): | |||
| 91 | for i in xrange(c): | 93 | for i in xrange(c): |
| 92 | yield [at(j, i) for j in xrange(r) ] | 94 | yield [at(j, i) for j in xrange(r) ] |
| 93 | 95 | ||
| 94 | |||
| 95 | def reorder_columns(rows, xchg_pairs): | 96 | def reorder_columns(rows, xchg_pairs): |
| 96 | for r in rows: | 97 | for r in rows: |
| 97 | print type(r) | ||
| 98 | for (x,y) in xchg_pairs: | 98 | for (x,y) in xchg_pairs: |
| 99 | r[x], r[y] = r[y], r[x] | 99 | r[x], r[y] = r[y], r[x] |
| 100 | yield r | 100 | yield r |
| 101 | 101 | ||
| 102 | def select_columns(rows, cols): | ||
| 103 | for r in rows: | ||
| 104 | yield [r[x] for x in cols] | ||
| 105 | |||
| 106 | def numpy_lstsq(x, y): | ||
| 107 | from numpy import ones, array | ||
| 108 | from numpy.linalg import lstsq | ||
| 109 | A = ones((len(y), 2), dtype=float) | ||
| 110 | A[:,0] = array(x) | ||
| 111 | b = array(y) | ||
| 112 | return lstsq(A, b)[0] | ||
| 113 | |||
| 114 | def max_delta(c0, c1, x, y): | ||
| 115 | return max([abs(c0 + c1 * float(x) - float(y)) for (x,y) in izip(x, y)]) | ||
| 116 | |||
| 117 | def least_squares(rows, xy_pairs, upper_bound): | ||
| 118 | cols = [] | ||
| 119 | for (x, y) in xy_pairs: | ||
| 120 | cols += [x, y] | ||
| 121 | rows = select_columns(rows, cols) | ||
| 122 | cols = transpose(rows) | ||
| 123 | for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(cols)): | ||
| 124 | c1, c0 = numpy_lstsq(xval, yval) | ||
| 125 | if upper_bound: | ||
| 126 | c0 += max_delta(c0, c1, xval, yval) | ||
| 127 | yield [x + 1, y + 1, c0, c1] | ||
| 128 | |||
| 102 | def select_by_key(rows, col, cast=None): | 129 | def select_by_key(rows, col, cast=None): |
| 103 | by_key = defdict(list) | 130 | by_key = defdict(list) |
| 104 | order = [] | 131 | order = [] |
| @@ -115,13 +142,17 @@ class CsvApp(defapp.App): | |||
| 115 | def __init__(self): | 142 | def __init__(self): |
| 116 | defapp.App.__init__(self, opts, defaults) | 143 | defapp.App.__init__(self, opts, defaults) |
| 117 | # fixup human-friendly offsets | 144 | # fixup human-friendly offsets |
| 118 | self.options.col -= 1 | 145 | if not self.options.col: |
| 119 | if self.options.col_xchg: | 146 | self.options.col = [1] |
| 120 | self.options.col_xchg = [(x - 1, y - 1) for (x, y) in | 147 | self.options.col = [x - 1 for x in self.options.col] |
| 121 | self.options.col_xchg] | 148 | self.options.col_pairs = list(pair_iter(self.options.col)) |
| 122 | 149 | ||
| 123 | def transform(self, make_iterator, ordered=True): | 150 | def transform(self, make_iterator, ordered=True): |
| 124 | """Average all rows with the same key in a given column.""" | 151 | """ |
| 152 | Read a file, pass the rows in the file to an iterator factory, and | ||
| 153 | write out the output of the iterator. The iterator performs the | ||
| 154 | desired transformation. | ||
| 155 | """ | ||
| 125 | files = list(self.args) | 156 | files = list(self.args) |
| 126 | del files[0] | 157 | del files[0] |
| 127 | for fn in files: | 158 | for fn in files: |
| @@ -130,7 +161,7 @@ class CsvApp(defapp.App): | |||
| 130 | rows = csv.reader(open(fn, 'r')) | 161 | rows = csv.reader(open(fn, 'r')) |
| 131 | # set up transformation | 162 | # set up transformation |
| 132 | if ordered: | 163 | if ordered: |
| 133 | (order, by_key) = select_by_key(rows, self.options.col, | 164 | (order, by_key) = select_by_key(rows, self.options.col[0], |
| 134 | float) | 165 | float) |
| 135 | rows = make_iterator(order, by_key) | 166 | rows = make_iterator(order, by_key) |
| 136 | else: | 167 | else: |
| @@ -152,10 +183,10 @@ class CsvApp(defapp.App): | |||
| 152 | def do_avg(self, _): | 183 | def do_avg(self, _): |
| 153 | def fixup_avg(key, rows, res): | 184 | def fixup_avg(key, rows, res): |
| 154 | res = row_div(len(rows), res) | 185 | res = row_div(len(rows), res) |
| 155 | res[self.options.col] = key | 186 | res[self.options.col[0]] = key |
| 156 | return res | 187 | return res |
| 157 | self.transform(row_reduce(row_add, fixup_avg)) | 188 | self.transform(row_reduce(row_add, fixup_avg)) |
| 158 | 189 | ||
| 159 | def do_max(self, _): | 190 | def do_max(self, _): |
| 160 | self.transform(row_reduce(row_max)) | 191 | self.transform(row_reduce(row_max)) |
| 161 | 192 | ||
| @@ -166,9 +197,18 @@ class CsvApp(defapp.App): | |||
| 166 | self.transform(transpose, ordered=False) | 197 | self.transform(transpose, ordered=False) |
| 167 | 198 | ||
| 168 | def do_reorder(self, _): | 199 | def do_reorder(self, _): |
| 169 | self.transform(lambda rows: reorder_columns(rows, | 200 | self.transform(lambda rows: reorder_columns( |
| 170 | self.options.col_xchg), | 201 | rows, self.options.col_pairs), ordered=False) |
| 171 | ordered=False) | 202 | |
| 203 | def do_select(self, _): | ||
| 204 | self.transform(lambda rows: select_columns( | ||
| 205 | rows, self.options.col), ordered=False) | ||
| 206 | |||
| 207 | def do_lstsqrs(self, _): | ||
| 208 | self.transform( | ||
| 209 | lambda rows: least_squares(rows, self.options.col_pairs, | ||
| 210 | self.options.upper_bound), | ||
| 211 | ordered=False) | ||
| 172 | 212 | ||
| 173 | if __name__ == '__main__': | 213 | if __name__ == '__main__': |
| 174 | CsvApp().launch() | 214 | CsvApp().launch() |
