add support for linear regressions

author: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2008-09-25 17:06:19 -0400
committer: Bjoern B. Brandenburg <bbb@cs.unc.edu> 2008-09-25 17:06:19 -0400
commit: afbe6a6bf1e7248aa0bdaf506413d604c974002c (patch)
tree: c86228c95ea915f534dc881627e6aa297ab55cce /csv_tool
parent: 5e3ea2b647cab744481e1dfa19fa1683bcdbac4b (diff)
1 file changed, 65 insertions, 25 deletions
diff --git a/csv_tool b/csv_tool
index e747462..96cd99e 100755
--- a/csv_tool
+++ b/csv_tool
@@ -11,34 +11,36 @@ import csv
 import operator
 import os.path
 from collections import defaultdict as defdict
+from itertools   import izip
 o = optparse.make_option
 opts = [
-    o('-x', '--exchange', action='append', dest='col_xchg',
+    o('-c', '--column', action='append', dest='col', type='int',
-      nargs=2, type='int',
+      help='The column(s) on which to operate.'),
-      help='Columns that should be switched with reorder.'),
-    o('-c', '--column', action='store', dest='col', type='int',
-      help='The column on which to operate.'),
    o(None, '--write-to-file', action='store_true', dest='write_to_file',
      help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),
-#     o(None, '--true', action='store_true', dest='truth',
+    o('-u', '--upper-bound', action='store_true', dest='upper_bound',
-#       help='A boolean flag value.'),
+      help="The linear regression is modified to represent an upper bound."),
 #     o(None, '--degree', action='store', type='float', dest='thruthiness',
 #       help='Not quite absolut truth.'),
    ]
 defaults = {
-    'col'           : 1,
+    'col'           : [],
-    'col_xcgh'      : [],
    'write_to_file' : False,
+    'upper_bound'   : False,
    }
+def pair_iter(it):
+    it = iter(it)
+    while True:
+        yield (it.next(), it.next())
 def make_vector_op(op):
    def vector_op(a, b, defvalue=0):
        if len(a) > len(b):
@@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res):
                yield fixup(key, rows, res)
    return _reduce
-        
 row_mul = make_scalar_op(operator.mul)
 row_div = make_scalar_op(operator.div)
@@ -82,7 +84,7 @@ def transpose(rows):
    rows = list(rows)
    if rows:
        r = len(rows)
-        c = max([len(x) for x in rows]) 
+        c = max([len(x) for x in rows])
        def at(x, y):
            try:
                return rows[x][y]
@@ -91,14 +93,39 @@ def transpose(rows):
        for i in xrange(c):
            yield [at(j, i) for j in xrange(r) ]
 def reorder_columns(rows, xchg_pairs):
    for r in rows:
-        print type(r)
        for (x,y) in xchg_pairs:
            r[x], r[y] = r[y], r[x]
        yield r
+def select_columns(rows, cols):
+    for r in rows:
+        yield [r[x] for x in cols]
+def numpy_lstsq(x, y):
+    from numpy import ones, array
+    from numpy.linalg import lstsq
+    A = ones((len(y), 2), dtype=float)
+    A[:,0] = array(x)
+    b      = array(y)
+    return lstsq(A, b)[0]
+def max_delta(c0, c1, x, y):
+    return max([abs(c0 + c1 * float(x) - float(y)) for (x,y) in izip(x, y)])
+def least_squares(rows, xy_pairs, upper_bound):
+    cols = []
+    for (x, y) in xy_pairs:
+        cols += [x, y]
+    rows = select_columns(rows, cols)
+    cols = transpose(rows)
+    for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(cols)):
+        c1, c0 = numpy_lstsq(xval, yval)
+        if upper_bound:
+            c0 += max_delta(c0, c1, xval, yval)
+        yield [x + 1, y + 1, c0, c1]
 def select_by_key(rows, col, cast=None):
    by_key = defdict(list)
    order  = []
@@ -115,13 +142,17 @@ class CsvApp(defapp.App):
    def __init__(self):
        defapp.App.__init__(self, opts, defaults)
        # fixup human-friendly offsets
-        self.options.col -= 1
+        if not self.options.col:
-        if self.options.col_xchg:
+            self.options.col = [1]
-            self.options.col_xchg = [(x - 1, y - 1) for (x, y) in
+        self.options.col = [x - 1 for x in self.options.col]
-                                     self.options.col_xchg]
+        self.options.col_pairs = list(pair_iter(self.options.col))
    def transform(self, make_iterator, ordered=True):
-        """Average all rows with the same key in a given column."""
+        """
+        Read a file, pass the rows in the file to an iterator factory, and
+        write out the output of the iterator. The iterator performs the
+        desired transformation.
+        """
        files = list(self.args)
        del files[0]
        for fn in files:
@@ -130,7 +161,7 @@ class CsvApp(defapp.App):
                rows = csv.reader(open(fn, 'r'))
                # set up transformation
                if ordered:
-                    (order, by_key) = select_by_key(rows, self.options.col,
+                    (order, by_key) = select_by_key(rows, self.options.col[0],
                                                    float)
                    rows = make_iterator(order, by_key)
                else:
@@ -152,10 +183,10 @@ class CsvApp(defapp.App):
    def do_avg(self, _):
        def fixup_avg(key, rows, res):
            res = row_div(len(rows), res)
-            res[self.options.col] = key
+            res[self.options.col[0]] = key
            return res
        self.transform(row_reduce(row_add, fixup_avg))
-                
    def do_max(self, _):
        self.transform(row_reduce(row_max))
@@ -166,9 +197,18 @@ class CsvApp(defapp.App):
        self.transform(transpose, ordered=False)
    def do_reorder(self, _):
-        self.transform(lambda rows: reorder_columns(rows,
+        self.transform(lambda rows: reorder_columns(
-                                                    self.options.col_xchg),
+                rows, self.options.col_pairs), ordered=False)
-                       ordered=False)
+    def do_select(self, _):
+        self.transform(lambda rows: select_columns(
+                rows, self.options.col), ordered=False)
+    def do_lstsqrs(self, _):
+        self.transform(
+            lambda rows: least_squares(rows, self.options.col_pairs,
+                                       self.options.upper_bound),
+            ordered=False)
 if __name__ == '__main__':
    CsvApp().launch()
author	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2008-09-25 17:06:19 -0400
committer	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2008-09-25 17:06:19 -0400
commit	afbe6a6bf1e7248aa0bdaf506413d604c974002c (patch)
tree	c86228c95ea915f534dc881627e6aa297ab55cce /csv_tool
parent	5e3ea2b647cab744481e1dfa19fa1683bcdbac4b (diff)

diff --git a/csv_tool b/csv_tool
index e747462..96cd99e 100755
--- a/csv_tool
+++ b/csv_tool
@@ -11,34 +11,36 @@ import csv
11	import operator	11	import operator
12	import os.path	12	import os.path
13	from collections import defaultdict as defdict	13	from collections import defaultdict as defdict
		14	from itertools import izip
14		15
15	o = optparse.make_option	16	o = optparse.make_option
16		17
17	opts = [	18	opts = [
18		19
19	o('-x', '--exchange', action='append', dest='col_xchg',	20	o('-c', '--column', action='append', dest='col', type='int',
20	nargs=2, type='int',	21	help='The column(s) on which to operate.'),
21	help='Columns that should be switched with reorder.'),
22
23	o('-c', '--column', action='store', dest='col', type='int',
24	help='The column on which to operate.'),
25		22
26	o(None, '--write-to-file', action='store_true', dest='write_to_file',	23	o(None, '--write-to-file', action='store_true', dest='write_to_file',
27	help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),	24	help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),
28		25
29	# o(None, '--true', action='store_true', dest='truth',	26	o('-u', '--upper-bound', action='store_true', dest='upper_bound',
30	# help='A boolean flag value.'),	27	help="The linear regression is modified to represent an upper bound."),
31		28
32	# o(None, '--degree', action='store', type='float', dest='thruthiness',	29	# o(None, '--degree', action='store', type='float', dest='thruthiness',
33	# help='Not quite absolut truth.'),	30	# help='Not quite absolut truth.'),
34	]	31	]
35		32
36	defaults = {	33	defaults = {
37	'col' : 1,	34	'col' : [],
38	'col_xcgh' : [],
39	'write_to_file' : False,	35	'write_to_file' : False,
		36	'upper_bound' : False,
40	}	37	}
41		38
		39	def pair_iter(it):
		40	it = iter(it)
		41	while True:
		42	yield (it.next(), it.next())
		43
42	def make_vector_op(op):	44	def make_vector_op(op):
43	def vector_op(a, b, defvalue=0):	45	def vector_op(a, b, defvalue=0):
44	if len(a) > len(b):	46	if len(a) > len(b):
@@ -74,7 +76,7 @@ def row_reduce(row_op, fixup=lambda key, rows, res: res):
74	yield fixup(key, rows, res)	76	yield fixup(key, rows, res)
75	return _reduce	77	return _reduce
76		78
77		79
78	row_mul = make_scalar_op(operator.mul)	80	row_mul = make_scalar_op(operator.mul)
79	row_div = make_scalar_op(operator.div)	81	row_div = make_scalar_op(operator.div)
80		82
@@ -82,7 +84,7 @@ def transpose(rows):
82	rows = list(rows)	84	rows = list(rows)
83	if rows:	85	if rows:
84	r = len(rows)	86	r = len(rows)
85	c = max([len(x) for x in rows])	87	c = max([len(x) for x in rows])
86	def at(x, y):	88	def at(x, y):
87	try:	89	try:
88	return rows[x][y]	90	return rows[x][y]
@@ -91,14 +93,39 @@ def transpose(rows):
91	for i in xrange(c):	93	for i in xrange(c):
92	yield [at(j, i) for j in xrange(r) ]	94	yield [at(j, i) for j in xrange(r) ]
93		95
94
95	def reorder_columns(rows, xchg_pairs):	96	def reorder_columns(rows, xchg_pairs):
96	for r in rows:	97	for r in rows:
97	print type(r)
98	for (x,y) in xchg_pairs:	98	for (x,y) in xchg_pairs:
99	r[x], r[y] = r[y], r[x]	99	r[x], r[y] = r[y], r[x]
100	yield r	100	yield r
101		101
		102	def select_columns(rows, cols):
		103	for r in rows:
		104	yield [r[x] for x in cols]
		105
		106	def numpy_lstsq(x, y):
		107	from numpy import ones, array
		108	from numpy.linalg import lstsq
		109	A = ones((len(y), 2), dtype=float)
		110	A[:,0] = array(x)
		111	b = array(y)
		112	return lstsq(A, b)[0]
		113
		114	def max_delta(c0, c1, x, y):
		115	return max([abs(c0 + c1 * float(x) - float(y)) for (x,y) in izip(x, y)])
		116
		117	def least_squares(rows, xy_pairs, upper_bound):
		118	cols = []
		119	for (x, y) in xy_pairs:
		120	cols += [x, y]
		121	rows = select_columns(rows, cols)
		122	cols = transpose(rows)
		123	for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(cols)):
		124	c1, c0 = numpy_lstsq(xval, yval)
		125	if upper_bound:
		126	c0 += max_delta(c0, c1, xval, yval)
		127	yield [x + 1, y + 1, c0, c1]
		128
102	def select_by_key(rows, col, cast=None):	129	def select_by_key(rows, col, cast=None):
103	by_key = defdict(list)	130	by_key = defdict(list)
104	order = []	131	order = []
@@ -115,13 +142,17 @@ class CsvApp(defapp.App):
115	def __init__(self):	142	def __init__(self):
116	defapp.App.__init__(self, opts, defaults)	143	defapp.App.__init__(self, opts, defaults)
117	# fixup human-friendly offsets	144	# fixup human-friendly offsets
118	self.options.col -= 1	145	if not self.options.col:
119	if self.options.col_xchg:	146	self.options.col = [1]
120	self.options.col_xchg = [(x - 1, y - 1) for (x, y) in	147	self.options.col = [x - 1 for x in self.options.col]
121	self.options.col_xchg]	148	self.options.col_pairs = list(pair_iter(self.options.col))
122		149
123	def transform(self, make_iterator, ordered=True):	150	def transform(self, make_iterator, ordered=True):
124	"""Average all rows with the same key in a given column."""	151	"""
		152	Read a file, pass the rows in the file to an iterator factory, and
		153	write out the output of the iterator. The iterator performs the
		154	desired transformation.
		155	"""
125	files = list(self.args)	156	files = list(self.args)
126	del files[0]	157	del files[0]
127	for fn in files:	158	for fn in files:
@@ -130,7 +161,7 @@ class CsvApp(defapp.App):
130	rows = csv.reader(open(fn, 'r'))	161	rows = csv.reader(open(fn, 'r'))
131	# set up transformation	162	# set up transformation
132	if ordered:	163	if ordered:
133	(order, by_key) = select_by_key(rows, self.options.col,	164	(order, by_key) = select_by_key(rows, self.options.col[0],
134	float)	165	float)
135	rows = make_iterator(order, by_key)	166	rows = make_iterator(order, by_key)
136	else:	167	else:
@@ -152,10 +183,10 @@ class CsvApp(defapp.App):
152	def do_avg(self, _):	183	def do_avg(self, _):
153	def fixup_avg(key, rows, res):	184	def fixup_avg(key, rows, res):
154	res = row_div(len(rows), res)	185	res = row_div(len(rows), res)
155	res[self.options.col] = key	186	res[self.options.col[0]] = key
156	return res	187	return res
157	self.transform(row_reduce(row_add, fixup_avg))	188	self.transform(row_reduce(row_add, fixup_avg))
158		189
159	def do_max(self, _):	190	def do_max(self, _):
160	self.transform(row_reduce(row_max))	191	self.transform(row_reduce(row_max))
161		192
@@ -166,9 +197,18 @@ class CsvApp(defapp.App):
166	self.transform(transpose, ordered=False)	197	self.transform(transpose, ordered=False)
167		198
168	def do_reorder(self, _):	199	def do_reorder(self, _):
169	self.transform(lambda rows: reorder_columns(rows,	200	self.transform(lambda rows: reorder_columns(
170	self.options.col_xchg),	201	rows, self.options.col_pairs), ordered=False)
171	ordered=False)	202
		203	def do_select(self, _):
		204	self.transform(lambda rows: select_columns(
		205	rows, self.options.col), ordered=False)
		206
		207	def do_lstsqrs(self, _):
		208	self.transform(
		209	lambda rows: least_squares(rows, self.options.col_pairs,
		210	self.options.upper_bound),
		211	ordered=False)
172		212
173	if __name__ == '__main__':	213	if __name__ == '__main__':
174	CsvApp().launch()	214	CsvApp().launch()