From afbe6a6bf1e7248aa0bdaf506413d604c974002c Mon Sep 17 00:00:00 2001 From: "Bjoern B. Brandenburg" Date: Thu, 25 Sep 2008 17:06:19 -0400 Subject: add support for linear regressions --- csv_tool | 90 ++++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 25 deletions(-) (limited to 'csv_tool') diff --git a/csv_tool b/csv_tool index e747462..96cd99e 100755 --- a/csv_tool +++ b/csv_tool @@ -11,34 +11,36 @@ import csv import operator import os.path from collections import defaultdict as defdict +from itertools import izip o = optparse.make_option opts = [ - o('-x', '--exchange', action='append', dest='col_xchg', - nargs=2, type='int', - help='Columns that should be switched with reorder.'), - - o('-c', '--column', action='store', dest='col', type='int', - help='The column on which to operate.'), + o('-c', '--column', action='append', dest='col', type='int', + help='The column(s) on which to operate.'), o(None, '--write-to-file', action='store_true', dest='write_to_file', help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'), -# o(None, '--true', action='store_true', dest='truth', -# help='A boolean flag value.'), + o('-u', '--upper-bound', action='store_true', dest='upper_bound', + help="The linear regression is modified to represent an upper bound."), # o(None, '--degree', action='store', type='float', dest='thruthiness', # help='Not quite absolut truth.'), ] defaults = { - 'col' : 1, - 'col_xcgh' : [], + 'col' : [], 'write_to_file' : False, + 'upper_bound' : False, } +def pair_iter(it): + it = iter(it) + while True: + yield (it.next(), it.next()) + def make_vector_op(op): def vector_op(a, b, defvalue=0): if len(a) > len(b): @@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res): yield fixup(key, rows, res) return _reduce - + row_mul = make_scalar_op(operator.mul) row_div = make_scalar_op(operator.div) @@ -82,7 +84,7 @@ def transpose(rows): rows = list(rows) if rows: r = len(rows) - c = max([len(x) for x in rows]) + c = max([len(x) for x in rows]) def at(x, y): try: return rows[x][y] @@ -91,14 +93,39 @@ def transpose(rows): for i in xrange(c): yield [at(j, i) for j in xrange(r) ] - def reorder_columns(rows, xchg_pairs): for r in rows: - print type(r) for (x,y) in xchg_pairs: r[x], r[y] = r[y], r[x] yield r +def select_columns(rows, cols): + for r in rows: + yield [r[x] for x in cols] + +def numpy_lstsq(x, y): + from numpy import ones, array + from numpy.linalg import lstsq + A = ones((len(y), 2), dtype=float) + A[:,0] = array(x) + b = array(y) + return lstsq(A, b)[0] + +def max_delta(c0, c1, x, y): + return max([abs(c0 + c1 * float(x) - float(y)) for (x,y) in izip(x, y)]) + +def least_squares(rows, xy_pairs, upper_bound): + cols = [] + for (x, y) in xy_pairs: + cols += [x, y] + rows = select_columns(rows, cols) + cols = transpose(rows) + for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(cols)): + c1, c0 = numpy_lstsq(xval, yval) + if upper_bound: + c0 += max_delta(c0, c1, xval, yval) + yield [x + 1, y + 1, c0, c1] + def select_by_key(rows, col, cast=None): by_key = defdict(list) order = [] @@ -115,13 +142,17 @@ class CsvApp(defapp.App): def __init__(self): defapp.App.__init__(self, opts, defaults) # fixup human-friendly offsets - self.options.col -= 1 - if self.options.col_xchg: - self.options.col_xchg = [(x - 1, y - 1) for (x, y) in - self.options.col_xchg] + if not self.options.col: + self.options.col = [1] + self.options.col = [x - 1 for x in self.options.col] + self.options.col_pairs = list(pair_iter(self.options.col)) def transform(self, make_iterator, ordered=True): - """Average all rows with the same key in a given column.""" + """ + Read a file, pass the rows in the file to an iterator factory, and + write out the output of the iterator. The iterator performs the + desired transformation. + """ files = list(self.args) del files[0] for fn in files: @@ -130,7 +161,7 @@ class CsvApp(defapp.App): rows = csv.reader(open(fn, 'r')) # set up transformation if ordered: - (order, by_key) = select_by_key(rows, self.options.col, + (order, by_key) = select_by_key(rows, self.options.col[0], float) rows = make_iterator(order, by_key) else: @@ -152,10 +183,10 @@ class CsvApp(defapp.App): def do_avg(self, _): def fixup_avg(key, rows, res): res = row_div(len(rows), res) - res[self.options.col] = key + res[self.options.col[0]] = key return res self.transform(row_reduce(row_add, fixup_avg)) - + def do_max(self, _): self.transform(row_reduce(row_max)) @@ -166,9 +197,18 @@ class CsvApp(defapp.App): self.transform(transpose, ordered=False) def do_reorder(self, _): - self.transform(lambda rows: reorder_columns(rows, - self.options.col_xchg), - ordered=False) + self.transform(lambda rows: reorder_columns( + rows, self.options.col_pairs), ordered=False) + + def do_select(self, _): + self.transform(lambda rows: select_columns( + rows, self.options.col), ordered=False) + + def do_lstsqrs(self, _): + self.transform( + lambda rows: least_squares(rows, self.options.col_pairs, + self.options.upper_bound), + ordered=False) if __name__ == '__main__': CsvApp().launch() -- cgit v1.2.2