From afbe6a6bf1e7248aa0bdaf506413d604c974002c Mon Sep 17 00:00:00 2001
From: "Bjoern B. Brandenburg" <bbb@cs.unc.edu>
Date: Thu, 25 Sep 2008 17:06:19 -0400
Subject: add support for linear regressions

---
 csv_tool | 90 ++++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 25 deletions(-)

(limited to 'csv_tool')

diff --git a/csv_tool b/csv_tool
index e747462..96cd99e 100755
--- a/csv_tool
+++ b/csv_tool
@@ -11,34 +11,36 @@ import csv
 import operator
 import os.path
 from collections import defaultdict as defdict
+from itertools   import izip
 
 o = optparse.make_option
 
 opts = [
 
-    o('-x', '--exchange', action='append', dest='col_xchg',
-      nargs=2, type='int',
-      help='Columns that should be switched with reorder.'),
-
-    o('-c', '--column', action='store', dest='col', type='int',
-      help='The column on which to operate.'),
+    o('-c', '--column', action='append', dest='col', type='int',
+      help='The column(s) on which to operate.'),
 
     o(None, '--write-to-file', action='store_true', dest='write_to_file',
       help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),
 
-#     o(None, '--true', action='store_true', dest='truth',
-#       help='A boolean flag value.'),
+    o('-u', '--upper-bound', action='store_true', dest='upper_bound',
+      help="The linear regression is modified to represent an upper bound."),
 
 #     o(None, '--degree', action='store', type='float', dest='thruthiness',
 #       help='Not quite absolut truth.'),
     ]
 
 defaults = {
-    'col'           : 1,
-    'col_xcgh'      : [],
+    'col'           : [],
     'write_to_file' : False,
+    'upper_bound'   : False,
     }
 
+def pair_iter(it):  # yield consecutive, non-overlapping 2-tuples drawn from it
+    it = iter(it)  # one shared iterator, so each pair consumes two items
+    while True:
+        yield (it.next(), it.next())  # Python 2 .next(); an unpaired trailing item is never yielded
+
 def make_vector_op(op):
     def vector_op(a, b, defvalue=0):
         if len(a) > len(b):
@@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res):
                 yield fixup(key, rows, res)
     return _reduce
 
-        
+
 row_mul = make_scalar_op(operator.mul)
 row_div = make_scalar_op(operator.div)
 
@@ -82,7 +84,7 @@ def transpose(rows):
     rows = list(rows)
     if rows:
         r = len(rows)
-        c = max([len(x) for x in rows]) 
+        c = max([len(x) for x in rows])
         def at(x, y):
             try:
                 return rows[x][y]
@@ -91,14 +93,39 @@ def transpose(rows):
         for i in xrange(c):
             yield [at(j, i) for j in xrange(r) ]
 
-
 def reorder_columns(rows, xchg_pairs):
     for r in rows:
-        print type(r)
         for (x,y) in xchg_pairs:
             r[x], r[y] = r[y], r[x]
         yield r
 
+def select_columns(rows, cols):  # project every row onto the column indices in cols
+    for r in rows:
+        yield [r[x] for x in cols]  # output order follows cols, not the source row
+
+def numpy_lstsq(x, y):  # least-squares fit of y against the columns [x, 1]
+    from numpy import ones, array  # imported locally, only when this function runs
+    from numpy.linalg import lstsq
+    A = ones((len(y), 2), dtype=float)  # column 1 stays all ones (constant term)
+    A[:,0] = array(x)  # column 0 holds the x values
+    b      = array(y)
+    return lstsq(A, b)[0]  # solution vector only, ordered like A's columns: (slope, intercept)
+
+def max_delta(c0, c1, x, y):  # largest absolute gap between the line c0 + c1*x and y
+    return max([abs(c0 + c1 * float(x) - float(y)) for (x,y) in izip(x, y)])  # NOTE: loop vars (x,y) reuse the parameter names
+
+def least_squares(rows, xy_pairs, upper_bound):  # yield one [x+1, y+1, c0, c1] fit per (x, y) column pair
+    cols = []
+    for (x, y) in xy_pairs:
+        cols += [x, y]  # flatten the pairs: x0, y0, x1, y1, ...
+    rows = select_columns(rows, cols)
+    cols = transpose(rows)  # rebinds cols: now the value columns, in the flattened order
+    for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(cols)):  # re-pair value columns with their indices
+        c1, c0 = numpy_lstsq(xval, yval)  # slope first, then intercept
+        if upper_bound:
+            c0 += max_delta(c0, c1, xval, yval)  # raise the intercept by the largest absolute residual
+        yield [x + 1, y + 1, c0, c1]  # adds 1 back to the 0-based column indices
+
 def select_by_key(rows, col, cast=None):
     by_key = defdict(list)
     order  = []
@@ -115,13 +142,17 @@ class CsvApp(defapp.App):
     def __init__(self):
         defapp.App.__init__(self, opts, defaults)
         # fixup human-friendly offsets
-        self.options.col -= 1
-        if self.options.col_xchg:
-            self.options.col_xchg = [(x - 1, y - 1) for (x, y) in
-                                     self.options.col_xchg]
+        if not self.options.col:
+            self.options.col = [1]
+        self.options.col = [x - 1 for x in self.options.col]
+        self.options.col_pairs = list(pair_iter(self.options.col))
 
     def transform(self, make_iterator, ordered=True):
-        """Average all rows with the same key in a given column."""
+        """
+        Read a file, pass the rows in the file to an iterator factory, and
+        write out the output of the iterator. The iterator performs the
+        desired transformation.
+        """
         files = list(self.args)
         del files[0]
         for fn in files:
@@ -130,7 +161,7 @@ class CsvApp(defapp.App):
                 rows = csv.reader(open(fn, 'r'))
                 # set up transformation
                 if ordered:
-                    (order, by_key) = select_by_key(rows, self.options.col,
+                    (order, by_key) = select_by_key(rows, self.options.col[0],
                                                     float)
                     rows = make_iterator(order, by_key)
                 else:
@@ -152,10 +183,10 @@ class CsvApp(defapp.App):
     def do_avg(self, _):
         def fixup_avg(key, rows, res):
             res = row_div(len(rows), res)
-            res[self.options.col] = key
+            res[self.options.col[0]] = key
             return res
         self.transform(row_reduce(row_add, fixup_avg))
-                
+
     def do_max(self, _):
         self.transform(row_reduce(row_max))
 
@@ -166,9 +197,18 @@ class CsvApp(defapp.App):
         self.transform(transpose, ordered=False)
 
     def do_reorder(self, _):
-        self.transform(lambda rows: reorder_columns(rows,
-                                                    self.options.col_xchg),
-                       ordered=False)
+        self.transform(lambda rows: reorder_columns(
+                rows, self.options.col_pairs), ordered=False)
+
+    def do_select(self, _):  # run select_columns with the column list in self.options.col
+        self.transform(lambda rows: select_columns(
+                rows, self.options.col), ordered=False)  # ordered=False skips transform's select_by_key branch
+
+    def do_lstsqrs(self, _):  # run least_squares over the pairs in self.options.col_pairs
+        self.transform(
+            lambda rows: least_squares(rows, self.options.col_pairs,
+                                       self.options.upper_bound),  # flag set by -u/--upper-bound
+            ordered=False)
 
 if __name__ == '__main__':
     CsvApp().launch()
-- 
cgit v1.2.2