summaryrefslogtreecommitdiffstats
path: root/data_analysis/statanalyzer.py
diff options
context:
space:
mode:
authorAndrea Bastoni <bastoni@cs.unc.edu>2010-04-12 23:23:50 -0400
committerAndrea Bastoni <bastoni@cs.unc.edu>2010-04-12 23:23:50 -0400
commit3b21f9d15822453117d1e908ab97cacd8f7f39be (patch)
tree09b0c519d11975fe5e1d364ff3174fa358407177 /data_analysis/statanalyzer.py
parent27a6355f1d25bdf8a68bea6e19cc283a943753da (diff)
Add IQR outliers removal
- this code is ugly C-style code "transformed" into Python; it should be reworked at some point...
Diffstat (limited to 'data_analysis/statanalyzer.py')
-rw-r--r--data_analysis/statanalyzer.py56
1 files changed, 56 insertions, 0 deletions
diff --git a/data_analysis/statanalyzer.py b/data_analysis/statanalyzer.py
new file mode 100644
index 0000000..6e096c4
--- /dev/null
+++ b/data_analysis/statanalyzer.py
@@ -0,0 +1,56 @@
1#!/usr/bin/env python
2
3import numpy as np
4from scipy import stats
5
class InterQuartileRange:
    """Trim a sample to the values lying between two percentiles.

    With ``extend`` left false, ``remOutliers`` keeps the sorted values ``v``
    that satisfy ``P_low < v <= P_high``, where ``P_low`` and ``P_high`` are
    the ``low``-th and ``high``-th percentiles of the sample.

    With ``extend`` true, the two percentiles are first snapped to the first
    sample strictly above each of them (or to the largest sample when there
    is none); the distance between those two samples, times 1.5, is then
    subtracted from the lower one and added to the upper one, and the
    resulting fences are used as the bounds instead.

    In both modes the lower bound is exclusive and the upper bound is
    inclusive.
    """

    def __init__(self, low, high, extend=False):
        """Store the configuration.

        Args:
            low: Lower percentile, in the 0-100 range expected by
                ``scipy.stats.scoreatpercentile`` (e.g. 25).
            high: Upper percentile, same range (e.g. 75).
            extend: When truthy, widen the kept range by 1.5 times the
                distance between the two matched samples.
        """
        self.low = low
        self.high = high
        # extend is 1.5 extension of IQR
        self.extend = extend

    @staticmethod
    def _first_above(svect, bound):
        """Return the index of the first element of sorted *svect* > *bound*.

        Returns ``len(svect)`` when no element exceeds *bound*, so the result
        is always safe to use as either end of a slice.
        """
        return int(np.searchsorted(svect, bound, side='right'))

    def remOutliers(self, vector):
        """Return a sorted copy of *vector* restricted to the kept range.

        Args:
            vector: One-dimensional sequence or array of numbers. It is not
                modified.

        Returns:
            A sorted ``numpy`` array holding the values ``v`` with
            ``lower < v <= upper``; see the class docstring for how the two
            bounds are derived. An empty input yields an empty array.
        """
        svect = np.sort(vector)
        if svect.size == 0:
            # Nothing to rank; also avoids indexing an empty array below.
            return svect

        lower = stats.scoreatpercentile(svect, self.low)
        upper = stats.scoreatpercentile(svect, self.high)

        if self.extend:
            # Snap each percentile to the first sample above it, falling back
            # to the largest sample when every value is <= the percentile.
            last = svect.size - 1
            low_sample = svect[min(self._first_above(svect, lower), last)]
            high_sample = svect[min(self._first_above(svect, upper), last)]

            # 1.5 IQR outliers elimination. The fences are deliberately NOT
            # clamped to [min, max]: combined with the strict comparison a
            # clamped fence would discard the extreme samples themselves even
            # when they are not outliers.
            margin = (high_sample - low_sample) * 1.5
            lower = low_sample - margin
            upper = high_sample + margin

        start = self._first_above(svect, lower)
        stop = self._first_above(svect, upper)
        return svect[start:stop]